## Author: Mengna Zhang
## Last updated on: 2025-09-15
## ---- Path configuration ----
## All paths below are built with paste0(), so every directory string must keep
## its trailing "/" (file names are appended directly to these values).
## root directory: can be extended to point at the main CNT folder
directory <- "/Users/"
## your own working directory under the root
my_directory <- paste0(directory, "mengnazhang/Desktop/")
## PRADI path (the folder where the PRADI raw files are located)
pradi_directory <- paste0(my_directory, "ADSP_DataPrep_local/PRADI/Phenotype/2025/Raw/")
## output path (the revised DD workbook path below is built from it)
out_directory <- paste0(my_directory, "ADSP_DataPrep_local/PRADI/Phenotype/2025/Cleaned/")
## script path (in the visible code it is only used by a commented-out source() call)
script_directory <- paste0(my_directory, "ADSP_DataPrep_local/PRADI/Phenotype/2025/Scripts/")
## revised data dictionary (DD) workbook; read later with one sheet per sub-dataset
revisedDDpath <- paste0(out_directory,"colnamesPerSubdata.xlsx")
## ---- Helpers and packages ----
## Project helper functions (info(), check_valid_responses(),
## check_valid_numeric_responses() are called below but not defined in this file,
## so they presumably come from this script -- TODO confirm).
source("/Users/mengnazhang/Desktop/ADSP_DataPrep/dataPrep2025/helperScripts_PRADI.R")
# source(paste0(script_directory,"helperScripts_WRAP.R"))
## library() stops immediately when a package is missing; require() would only
## warn and return FALSE, letting the script fail later at an unrelated line.
library(dplyr)
library(readxl)
library(openxlsx)
library(stringr)
library(tidyr)
library(lubridate)
library(ggplot2)
## "not in" operator: x %!in% y is the negation of x %in% y
`%!in%` <- Negate(`%in%`)
## ---- Load every raw PRADI export listed in filelist.txt ----
file_list <- paste0(out_directory, "filelist.txt")
## one workbook file name per line; strip padding and drop blank lines so an
## empty (e.g. trailing) line does not become a bogus read_excel() call
file_names <- trimws(readLines(file_list))
file_names <- file_names[nzchar(file_names)]
## object name = file name without the "PRADI_" prefix and "_05122025.xlsx" suffix
clean_names <- sub("_05122025\\.xlsx$", "", sub("^PRADI_", "", file_names))
if (anyDuplicated(clean_names) > 0) {
  warning(
    "Duplicate dataset names in filelist.txt; later files overwrite earlier ones: ",
    paste(unique(clean_names[duplicated(clean_names)]), collapse = ", "),
    call. = FALSE
  )
}
## read each workbook and store it in the global environment under its clean name
for (i in seq_along(file_names)) {
  fname <- file_names[i]
  clean_name <- clean_names[i]
  assign(
    clean_name,
    as.data.frame(read_excel(paste0(pradi_directory, fname), sheet = "Export Worksheet")),
    envir = .GlobalEnv
  )
}
## Track exactly the datasets loaded above. Scanning the global environment for
## data frames would also pick up working objects (df, dfDD, tmp, ...) whenever
## this section is re-run in the same session, which corrupts common_cols and
## the column-name workbook. Sorted to match the alphabetical order of ls().
df_names <- sort(unique(clean_names))
## extract the column names for each data frame
column_lists <- lapply(df_names, function(name) colnames(get(name, envir = .GlobalEnv)))
## find common columns across all data frames
common_cols <- Reduce(intersect, column_lists)
# Print the result
print(common_cols)
## [1] "SYSXM" "SYSIND" "SYSGP" "SYSGPSTUDY"
## [5] "SYSINDGP" "CGI_ORDER" "GPS_ORDER" "STDCGI_ORDER"
## [9] "LSTUDY" "DB_OWNER" "STUDY" "SUBSTUDY"
## [13] "CENTER" "GP" "IND" "REFCTR"
## [17] "DATE_OF_BIRTH"
## ---- Column-name template workbook (run once) ----
## Generates an Excel file with one sheet per dataset, each sheet listing that
## dataset's column names in a single "VarNames" column. The workbook is then
## filled in by hand (one row of information per variable) to become the
## revised DD.
## Name and location of the file to save.
## NOTE(review): colnames_file was not defined anywhere in the visible script;
## falling back to revisedDDpath assumes the template and the revised DD are the
## same workbook ("colnamesPerSubdata.xlsx") -- confirm.
if (!exists("colnames_file")) {
  colnames_file <- revisedDDpath
}
if (file.exists(colnames_file)) {
  ## Re-running must never wipe a DD that has already been filled in by hand.
  message(
    "Column-name workbook already exists, not regenerating: ", colnames_file,
    " (delete the file first to rebuild the template)"
  )
} else {
  # Create a new workbook
  wb <- createWorkbook()
  # For each data frame, add a sheet with its column names
  for (df_name in df_names) {
    # Excel limits sheet names to 31 characters
    sheet_name <- substr(df_name, 1, 31)
    addWorksheet(wb, sheetName = sheet_name)
    # Write the column names as a single-column table; built inline so no
    # helper data frame is left behind in the global environment
    writeData(wb, sheet = sheet_name, data.frame(VarNames = colnames(get(df_name))))
  }
  # Save the workbook
  saveWorkbook(wb, file = colnames_file, overwrite = FALSE)
}
## ---- AAAD_GERIAT: initial inspection ----
## work on `df`; the cleaned result is assigned back to AAAD_GERIAT at the end
## of this section
df <- AAAD_GERIAT
## info() is a project helper not defined in this file; judging by the recorded
## output it reports row, column and ID counts -- TODO confirm
info(AAAD_GERIAT,"SYSIND")
## #obs:1051, cols:62, inds:939
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps every column in the output
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 1051 obs. of 62 variables:
## $ SYSXM : num 7534713 7540453 7540583 7540653 7540803 ...
## $ SYSIND : num 11108883 11006263 11048913 11048883 11059623 ...
## $ SYSGP : num 7920393 7888673 7896183 7896183 7897223 ...
## $ SYSGPSTUDY : num 1357713 1304013 1311503 1311503 1312543 ...
## $ SYSINDGP : num 7868403 7761063 7804773 7804743 7818553 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87634 87534 87657 87657 87699 ...
## $ IND : num 1 104 102 1000 101 108 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2018-01-12" "2018-02-21" ...
## $ EXAMINER : chr "axr1589" "v.rodriguez4" "v.rodriguez4" "axr1589" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1950-06-03" "1936-09-20" ...
## $ AGE_AT_EXAM : num 67 81 72 94 88 68 67 71 68 69 ...
## $ SATISFIED_LIFE : chr "Y" "Y" "Y" "Y" ...
## $ DROPPED_ACTIVITIES : chr "N" "N" "N" "N" ...
## $ FEEL_EMPTY : chr "N" "N" "N" "N" ...
## $ GOOD_SPIRIT : chr "Y" "Y" "Y" "Y" ...
## $ AFRAID_BAD_THINGS : chr "N" "N" "N" "N" ...
## $ BORED : chr "N" "N" "N" "Y" ...
## $ FEEL_HAPPY : chr "Y" "Y" "Y" "Y" ...
## $ FEEL_HELPLESS : chr "N" "N" "N" "N" ...
## $ STAY_HOME : chr "N" "N" "N" "N" ...
## $ MEMORY_PROBLEM : chr "N" "N" "N" "N" ...
## $ ALIVE : chr "Y" "Y" "Y" "Y" ...
## $ FEEL_WORTHLESS : chr "N" "N" "N" "N" ...
## $ FEEL_FULL_ENERGY : chr "Y" "Y" "Y" "Y" ...
## $ FEEL_HOPELESS : chr "N" "N" "N" "N" ...
## $ OTHER_BETTER_OFF : chr "N" "N" "N" "N" ...
## $ TROUBLE_FALL_ASLEEP: logi NA NA NA NA NA NA ...
## $ TROUBLE_STAY_ASLEEP: logi NA NA NA NA NA NA ...
## $ SLEEPING_TOO_MUCH : logi NA NA NA NA NA NA ...
## $ APPETITE_INCREASED : logi NA NA NA NA NA NA ...
## $ APPETITE_DECREASED : logi NA NA NA NA NA NA ...
## $ WEIGHT_LOSS : logi NA NA NA NA NA NA ...
## $ AMOUNT_WEIGHT_LOSS : logi NA NA NA NA NA NA ...
## $ SATISFYING_LIFE : logi NA NA NA NA NA NA ...
## $ COMMENTS : logi NA NA NA NA NA NA ...
## $ RELIABLE : logi NA NA NA NA NA NA ...
## $ LIFE_SCORE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ACTIVITY_SCORE : num 0 0 0 0 1 0 0 0 1 1 ...
## $ EMPTY_SCORE : num 0 0 0 0 0 0 0 0 0 1 ...
## $ BORED_SCORE : num 0 0 0 1 0 0 0 0 0 0 ...
## $ SPIRIT_SCORE : num 0 0 0 0 0 0 0 0 0 1 ...
## $ AFRAID_SCORE : num 0 0 0 0 0 0 0 0 1 0 ...
## $ HAPPY_SCORE : num 0 0 0 0 0 0 1 0 0 0 ...
## $ HELPLESS_SCORE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ STAY_HOME_SCORE : num 0 0 0 0 1 1 0 0 1 0 ...
## $ MEMORY_SCORE : num 0 0 0 0 1 0 0 0 0 1 ...
## $ ALIVE_SCORE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ WORTHLESS_SCORE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FULL_ENERGY_SCORE : num 0 0 0 0 0 0 0 0 0 1 ...
## $ HOPELESS_SCORE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ BETTER_OFF_SCORE : num 0 0 0 0 0 0 1 0 0 1 ...
## $ TOTAL_STATUS : chr NA NA NA NA ...
## $ TOTAL : num 0 0 0 1 3 1 2 0 3 6 ...
## read the revised DD sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_GERIAT")
## extract all logical variables
## (vapply always returns a logical mask, even for a zero-column data frame,
## where sapply would return a list and break the subsetting)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 11 vars
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 11 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 TROUBLE_FALL_ASLEEP <NA>
## 3 TROUBLE_STAY_ASLEEP <NA>
## 4 SLEEPING_TOO_MUCH <NA>
## 5 APPETITE_INCREASED <NA>
## 6 APPETITE_DECREASED <NA>
## 7 WEIGHT_LOSS <NA>
## 8 AMOUNT_WEIGHT_LOSS <NA>
## 9 SATISFYING_LIFE <NA>
## 10 COMMENTS <NA>
## 11 RELIABLE <NA>
## REFCTR is the only logical column with a data type in the DD (VARCHAR2(6)),
## so it is the only one stored as character
convert2chr <- "REFCTR"
## apply the conversion column by column
df[convert2chr] <- Map(as.character, df[convert2chr])
## list the unique data types again after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "logical"
## NOTE: For the other 10 variables, the DD does not provide data type information, so I’m leaving them unspecified for now.
## extract date variables from sub-dataset (columns imported as date-times)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD; compared case-insensitively so
## that spellings such as "Date" are not silently missed
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2018-01-12 1950-06-03
## 2 2018-02-21 1936-09-20
## 3 2018-02-19 1946-01-11
## 4 2018-02-19 1923-04-17
## 5 2018-02-18 1929-10-08
## 6 2018-02-19 1949-08-01
## convert format: date-time -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date" "logical"
## extract character variables from sub-dataset
## (vapply guarantees a logical mask; sapply is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))]
## 23 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but a different type in the DD
## mismatchChrs_2: character in the DD but a different type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables that have a value specification in the DD
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
## (check_valid_responses() is a project helper not defined in this file)
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER, as I assume we can have multiple examiners
## extract numeric variables from sub-dataset
## (vapply guarantees a logical mask; sapply is not type-stable)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 27 vars
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric DD variables that carry a value specification; same rows and columns
## as re-filtering dfDD, without repeating the type pattern
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]
## check_valid_numeric_responses() is a project helper not defined in this file
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its original name
AAAD_GERIAT <- df
## ---- AAAD_MEDCON: initial inspection ----
## work on `df`; the cleaned result is assigned back to AAAD_MEDCON at the end
## of this section
df <- AAAD_MEDCON
## info() is a project helper not defined in this file; judging by the recorded
## output it reports row, column and ID counts -- TODO confirm
info(AAAD_MEDCON,"SYSIND")
## #obs:397, cols:256, inds:367
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps every column in the output
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 397 obs. of 256 variables:
## $ SYSXM : num 7134193 7839953 7838803 7838853 7838933 ...
## $ SYSIND : num 11010563 11368403 11368463 11368453 11368443 ...
## $ SYSGP : num 7889553 7950923 7950983 7950973 7950963 ...
## $ SYSGPSTUDY : num 1304893 1396033 1396093 1396083 1396073 ...
## $ SYSINDGP : num 7765583 8137673 8137733 8137723 8137713 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87580 88413 88419 88418 88417 ...
## $ IND : num 1 1 1 1 1 100 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2016-09-07" "2020-03-06" ...
## $ EXAMINER : chr "ladams4" "sjt82" "mxp1257" "mxp1257" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1925-08-21" "1928-02-14" ...
## $ AGE_AT_EXAM : num 91 92 74 75 75 64 83 77 79 85 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ MEMORY_COMPLAINTS : num NA 1 0 0 0 1 0 0 0 0 ...
## $ DATE_OF_ONSET : POSIXct, format: NA NA ...
## $ DOA_UNK : chr NA "U" NA NA ...
## $ DESCRIBE : chr "hello" "No family history of dementia." "Brother with PD." "Denies psychological/psychiatric conditions. Brother, mother, aunt (father's sister) and grandparents with AD "| __truncated__ ...
## $ MEM_COMPLAINTS : chr NA "misplaces objects Informant: Daughter and self 92 YO widow female who lives with daughter and visits senior c"| __truncated__ "None. Lives alone. Can cook, clean and take care of self. Fully capable of self care and fully oriented. Drives"| __truncated__ "none. denies memoery issues. Lives with daughter. can cook, clean and do chores but daughter helps. Says she co"| __truncated__ ...
## $ CURRENT_MED : chr NA NA "Hypertension and Thyroid issues" "Hypertension + neuropathy + metformin" ...
## $ PMH : chr NA "HTN" "Hypertension and Thyroid issues" NA ...
## $ MOOD_CHANGES : chr NA "denies" "denies." NA ...
## $ MEDICATIONS : chr NA "lisinopril, clopidogrel" NA "not collected" ...
## $ HYPERTENSION_DX : num NA 1 NA NA NA 1 NA 0 1 1 ...
## $ HYPERTENSION_TREATED : num NA 1 NA NA NA 1 NA 0 NA NA ...
## $ DIABETES_DX : num NA 0 NA NA NA 1 NA 0 0 1 ...
## $ DIABETES_TREATED : num NA NA NA NA NA 1 NA 0 NA NA ...
## $ MYOCARDIAL_DX : num NA 0 NA NA NA 0 NA 0 0 1 ...
## $ MYOCARDIAL_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ HEART_FAILURE_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ HEART_FAILURE_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ HEART_DISEASE_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ HEART_DISEASE_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ COPD_DX : num NA 0 NA NA NA 0 NA 0 0 1 ...
## $ COPD_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ THYROID_DX : num NA 0 NA NA NA 0 NA 1 0 0 ...
## $ THYROID_TREATED : num NA NA NA NA NA 0 NA 1 NA NA ...
## $ LIVER_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ LIVER_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ RENAL_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ RENAL_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ PEPTIC_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ PEPTIC_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ PERIPHERAL_DX : num NA 0 NA NA NA 0 NA 1 0 0 ...
## $ PERIPHERAL_TREATED : num NA NA NA NA NA 0 NA 1 NA NA ...
## $ STROKE_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ STROKE_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ TIA_DX : num NA 0 NA NA NA 0 NA 0 0 1 ...
## $ TIA_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ HEAD_INJURY_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ HEAD_INJURY_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ SEIZURE_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ SEIZURE_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ CANCER_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ CANCER_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ ARTHRITIS_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ ARTHRITIS_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ SYPHILIS_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ SYPHILIS_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ ALCOHOL_DX : num NA 0 NA NA NA 0 NA 0 0 1 ...
## $ ALCOHOL_TREATED : num NA NA NA NA NA 0 NA 0 0 9 ...
## $ ILLICIT_DRUG_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ ILLICIT_DRUG_TREATED : num NA NA NA NA NA 0 NA 0 0 9 ...
## $ SMOKING_DX : num NA 0 NA NA NA 0 NA 0 0 1 ...
## $ SMOKING_TREATED : num NA NA NA NA NA 0 NA 0 0 9 ...
## $ PD_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ PD_TREATED : num NA NA NA NA NA 0 NA 0 0 9 ...
## $ HUNTINGTON_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ HUNTINGTON_TREATED : num NA NA NA NA NA 0 NA 0 0 9 ...
## $ MULTIPLE_SCLEROSIS_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ MULTIPLE_SCLEROSIS_TREATED: num NA NA NA NA NA 0 NA 0 NA 9 ...
## $ B12_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ B12_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ HYDROCEPHALUS_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ HYDROCEPHALUS_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ TREMOR_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ TREMOR_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ DOWN_SYNDROME_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ DOWN_SYNDROME_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ MED_CONDITIONS_DX : num NA 0 NA NA NA 0 NA 0 0 0 ...
## $ MED_CONDITIONS_TREATED : num NA NA NA NA NA 0 NA 0 NA NA ...
## $ OTH_MED_COND_SP : chr NA NA NA NA ...
## $ STROKE_BRAIN : num NA 0 NA NA NA 0 NA 0 9 1 ...
## $ DOCTOR : num NA NA NA NA NA 0 NA 0 9 1 ...
## $ STROKE_PAST : num NA NA NA NA NA 0 NA 0 9 1 ...
## $ STROKE_24HRS : num NA NA NA NA NA 0 NA 0 9 0 ...
## $ SYMPTOMS : num NA NA NA NA NA 0 NA 0 9 0 ...
## $ LOST_SPEECH : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ LOST_UNDERSTAND : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ LOSS_CONSCIOUS : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ WEAKNESS : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ NUMBNESS : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ LOSS_VISION : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ HALF_VISION : num NA 0 NA NA NA 0 NA 0 9 9 ...
## $ PERIOD : num NA 9 NA NA NA 9 NA 0 9 0 ...
## $ AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ DONT_KNOW : chr NA NA NA NA ...
## $ SEEK_HELP : num NA NA NA NA NA NA NA NA NA NA ...
## $ TREATMENT : num NA NA NA NA NA NA NA 0 NA NA ...
## $ MEDS : num NA NA NA NA NA NA NA 0 NA NA ...
## $ PSYCHOTHERAPY : num NA NA NA NA NA NA NA 0 NA NA ...
## $ OTHER : num NA NA NA NA NA NA NA 0 NA NA ...
## $ SPECIFY : chr NA NA NA NA ...
## $ UNKNOWN : num NA NA NA NA NA NA NA 0 NA NA ...
## $ TAKING_MEDS : num NA NA NA NA NA NA NA 1 NA NA ...
## $ MEDICATION1 : chr NA NA NA NA ...
## $ STRENGTH1 : chr NA NA NA NA ...
## $ SEEN1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN1_SPEC : chr NA NA NA NA ...
## $ MEDICATION2 : chr NA NA NA NA ...
## $ STRENGTH2 : chr NA NA NA NA ...
## $ SEEN2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN2_SPEC : chr NA NA NA NA ...
## $ MEDICATION3 : chr NA NA NA NA ...
## $ STRENGTH3 : chr NA NA NA NA ...
## $ SEEN3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN3_SPEC : chr NA NA NA NA ...
## $ MEDICATION4 : chr NA NA NA NA ...
## $ STRENGTH4 : chr NA NA NA NA ...
## $ SEEN4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN4_SPEC : chr NA NA NA NA ...
## $ MEDICATION5 : chr NA NA NA NA ...
## $ STRENGTH5 : chr NA NA NA NA ...
## $ SEEN5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN5_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION6 : chr NA NA NA NA ...
## $ STRENGTH6 : chr NA NA NA NA ...
## $ SEEN6 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN6_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION7 : chr NA NA NA NA ...
## $ STRENGTH7 : chr NA NA NA NA ...
## $ SEEN7 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN7_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION8 : chr NA NA NA NA ...
## $ STRENGTH8 : chr NA NA NA NA ...
## $ SEEN8 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN8_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION9 : chr NA NA NA NA ...
## $ STRENGTH9 : chr NA NA NA NA ...
## $ SEEN9 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN9_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION10 : chr NA NA NA NA ...
## $ STRENGTH10 : chr NA NA NA NA ...
## $ SEEN10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN10_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION11 : chr NA NA NA NA ...
## $ STRENGTH11 : chr NA NA NA NA ...
## $ SEEN11 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN11_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION12 : chr NA NA NA NA ...
## $ STRENGTH12 : chr NA NA NA NA ...
## $ SEEN12 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN12_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION13 : chr NA NA NA NA ...
## $ STRENGTH13 : chr NA NA NA NA ...
## $ SEEN13 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN13_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION14 : chr NA NA NA NA ...
## $ STRENGTH14 : chr NA NA NA NA ...
## $ SEEN14 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN14_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION15 : chr NA NA NA NA ...
## $ STRENGTH15 : logi NA NA NA NA NA NA ...
## $ SEEN15 : logi NA NA NA NA NA NA ...
## $ SEEN15_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION16 : chr NA NA NA NA ...
## $ STRENGTH16 : logi NA NA NA NA NA NA ...
## $ SEEN16 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN16_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION17 : chr NA NA NA NA ...
## $ STRENGTH17 : logi NA NA NA NA NA NA ...
## $ SEEN17 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SEEN17_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION18 : logi NA NA NA NA NA NA ...
## $ STRENGTH18 : logi NA NA NA NA NA NA ...
## $ SEEN18 : logi NA NA NA NA NA NA ...
## $ SEEN18_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION19 : logi NA NA NA NA NA NA ...
## $ STRENGTH19 : logi NA NA NA NA NA NA ...
## $ SEEN19 : logi NA NA NA NA NA NA ...
## $ SEEN19_SPEC : logi NA NA NA NA NA NA ...
## $ MEDICATION20 : logi NA NA NA NA NA NA ...
## $ STRENGTH20 : logi NA NA NA NA NA NA ...
## $ SEEN20 : logi NA NA NA NA NA NA ...
## $ SEEN20_SPEC : logi NA NA NA NA NA NA ...
## $ NOTES : chr NA NA NA NA ...
## $ WARFARIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ ASPIRIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIPLATELETS : num NA NA NA NA NA NA NA NA NA NA ...
## $ DIURETICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTICONVULSANTS : num NA NA NA NA NA NA NA NA NA NA ...
## $ INSULIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYPOGLYCEMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ SULFONYLUREA : num NA NA NA NA NA NA NA NA NA NA ...
## $ METFORMIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ GLITAZONES : num NA NA NA NA NA NA NA NA NA NA ...
## $ DIGITALIS : num NA NA NA NA NA NA NA NA NA NA ...
## $ NITRATES : num NA NA NA NA NA NA NA NA NA NA ...
## $ CALCIUM_CHANNEL : num NA NA NA NA NA NA NA NA NA NA ...
## $ BETA_2_AGAONIST : num NA NA NA NA NA NA NA NA NA NA ...
## $ BETA_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ACE : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTI_ARRHYTHMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTI_HYPERLIPIDEMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ STATIN_DRUG : num NA NA NA NA NA NA NA NA NA NA ...
## $ FIBRATE_DRUG : num NA NA NA NA NA NA NA NA NA NA ...
## $ THYROID : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTICHOLINERGICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ LEVODOPA : num NA NA NA NA NA NA NA NA NA NA ...
## $ DOPAMINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIDEPRESSANTS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIPSYCHOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANXIOLYTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHOLINESTERASE : num NA NA NA NA NA NA NA NA NA NA ...
## $ RIVASTIGMINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ TACRINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ DONEPEZIL : num NA NA NA NA NA NA NA NA NA NA ...
## $ GALANTAMINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ NMDA : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEMANTINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ ALPHA_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYPNOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ H1_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ H2_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ NSAID : num NA NA NA NA NA NA NA NA NA NA ...
## $ COX2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NARCOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYDERGINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ DEPRENYL : num NA NA NA NA NA NA NA NA NA NA ...
## $ ESTROGEN_SUPP : num NA NA NA NA NA NA NA NA NA NA ...
## $ PRESCRIPTION : num NA NA NA NA NA NA NA NA NA NA ...
## $ OTC : num NA NA NA NA NA NA NA NA NA NA ...
## $ STEROIDS : num NA NA NA NA NA NA NA NA NA NA ...
## $ OTHER_MEDS : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPEC_MEDS : chr NA NA NA NA ...
## $ MULTIVITAMINS : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_C : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_E : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMINE_B12 : num NA NA NA NA NA NA NA NA NA NA ...
## $ COENZYME_Q : num NA NA NA NA NA NA NA NA NA NA ...
## $ DHA : num NA NA NA NA NA NA NA NA NA NA ...
## $ LECITHIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ GINKGO : num NA NA NA NA NA NA NA NA NA NA ...
## $ FOLIC_ACID : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_B6 : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_D : num NA NA NA NA NA NA NA NA NA NA ...
## $ OMEGA3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDCOND_COMENTS : logi NA NA NA NA NA NA ...
## read the revised DD sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_MEDCON")
## extract all logical variables
## (vapply always returns a logical mask, even for a zero-column data frame,
## where sapply would return a list and break the subsetting)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))] ## 33 vars
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 33 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## 4 SEEN5_SPEC VARCHAR2(100)
## 5 SEEN6_SPEC VARCHAR2(100)
## 6 SEEN7_SPEC VARCHAR2(100)
## 7 SEEN8_SPEC VARCHAR2(100)
## 8 SEEN9_SPEC VARCHAR2(100)
## 9 SEEN10_SPEC VARCHAR2(100)
## 10 SEEN11_SPEC VARCHAR2(100)
## # ℹ 23 more rows
## Logical columns are re-typed according to the DD.
## select the vars to be converted to numeric; matched case-insensitively, like
## every other numeric DD lookup in this script, so a "number" entry is not
## misclassified as character
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## "SEEN15" "SEEN18" "SEEN19" "SEEN20"
## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 28 vars
## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date variables from sub-dataset (columns imported as date-times)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "DATE_OF_ONSET"
## extract date variables from regenerated DD; compared case-insensitively so
## that spellings such as "Date" are not silently missed
datecolsFromDD <- dfDD$VarNames[toupper(dfDD$`Data Type`) %in% "DATE"]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## "REVIEW_DATE": ignore it, since it has been converted in the previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even if only one date column is present
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH DATE_OF_ONSET
## 1 2016-09-07 1925-08-21 <NA>
## 2 2020-03-06 1928-02-14 <NA>
## 3 2020-03-06 1945-09-01 <NA>
## 4 2020-03-06 1944-03-22 <NA>
## 5 2020-03-06 1944-10-17 <NA>
## 6 2020-03-05 1955-06-26 2016-09-09
## convert format: date-time -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## extract character variables from sub-dataset
## (vapply guarantees a logical mask; sapply is not type-stable)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))]
## 81 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but a different type in the DD
## mismatchChrs_2: character in the DD but a different type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## DOA_UNK, ignore, I have updated DD to "char"
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables that have a value specification in the DD
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
## (check_valid_responses() is a project helper not defined in this file)
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## extract numeric variables from sub-dataset
## (vapply guarantees a logical mask; sapply is not type-stable)
## NOTE(review): the original note said "31 vars"; with 256 columns, 81 chr and
## 4 date columns that count looks stale -- re-check with length(numcols)
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))]
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but a different type in the DD
## mismatchNums_2: numeric in the DD but a different type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "0;\r\n1;"
## [5] "0;\r\n1;\r\n9;\r\n-1;" "0;\r\n1;\r\n7;\r\n8;\r\n9;"
## [7] "0;\r\n1;\r\n9;"
## numeric DD variables that carry a value specification; same rows and columns
## as re-filtering dfDD, without repeating the type pattern
tmp <- numColsfromDD[!is.na(numColsfromDD$`Valid Responses`), ]
## check_valid_numeric_responses() is a project helper not defined in this file
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP and IND
## store the cleaned sub-dataset back under its original name
AAAD_MEDCON <- df
## ---- AAAD_SOCIO_DEMO: initial inspection ----
## work on `df`, following the same pattern as the previous sub-datasets
df <- AAAD_SOCIO_DEMO
## info() is a project helper not defined in this file; judging by the recorded
## output it reports row, column and ID counts -- TODO confirm
info(AAAD_SOCIO_DEMO,"SYSIND")
## #obs:402, cols:161, inds:391
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure listing; the large list.len keeps every column in the output
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 402 obs. of 161 variables:
## $ SYSXM : num 7895153 7875263 7879973 7879993 7880213 ...
## $ SYSIND : num 11218613 11036843 11041143 11041043 11005233 ...
## $ SYSGP : num 7928123 7893863 7894373 7894373 7888553 ...
## $ SYSGPSTUDY : num 1366233 1309183 1309693 1309693 1303893 ...
## $ SYSINDGP : num 7981883 7792583 7797003 7796903 7760033 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87998 87598 87502 87502 87501 ...
## $ IND : num 1 1 102 100 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2021-02-01" "2020-07-14" ...
## $ EXAMINER : chr "sjt82" "v.rodriguez4" "prm72" "prm72" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1943-09-22" "1946-10-04" ...
## $ AGE_AT_EXAM : num 77 73 69 71 81 84 85 77 83 85 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ SDF1 : num 0 NA NA NA NA NA NA NA NA NA ...
## $ SDF2 : num 1 NA NA NA NA NA NA NA NA NA ...
## $ SDF2A : chr "WH" NA NA NA ...
## $ SDF3 : num 0 NA NA NA NA NA NA NA NA NA ...
## $ SDF3A : chr "SPANISH" NA NA NA ...
## $ SDF4 : chr "W" NA NA NA ...
## $ SDF4A : chr NA NA NA NA ...
## $ SDF5 : chr "A" NA NA NA ...
## $ SDF5A : chr NA NA NA NA ...
## $ SDF6 : chr "SF" NA NA NA ...
## $ SDF6A : chr NA NA NA NA ...
## $ SDF7 : chr NA NA NA NA ...
## $ SDF8A : chr "PT" NA NA NA ...
## $ SDF8B : chr NA NA NA NA ...
## $ SDF8C : chr NA NA NA NA ...
## $ SDF9 : chr NA NA NA NA ...
## $ SDF10 : chr NA NA NA NA ...
## $ SDF11 : chr NA NA NA NA ...
## $ SDF12 : logi NA NA NA NA NA NA ...
## $ SDF13 : logi NA NA NA NA NA NA ...
## $ SDF14 : logi NA NA NA NA NA NA ...
## $ SDF15 : logi NA NA NA NA NA NA ...
## $ SDF16 : logi NA NA NA NA NA NA ...
## $ SDF17 : logi NA NA NA NA NA NA ...
## $ SDF17A : logi NA NA NA NA NA NA ...
## $ SDF18 : logi NA NA NA NA NA NA ...
## $ SDF19 : logi NA NA NA NA NA NA ...
## $ SDF20 : chr "TEACHER ASSISTANT" "UNKNOWN" "Computer service tech" "Engineer" ...
## $ SDF21 : chr NA NA NA NA ...
## $ SDF22 : logi NA NA NA NA NA NA ...
## $ SDF22A : chr NA NA NA NA ...
## $ SDF23 : logi NA NA NA NA NA NA ...
## $ SDF24 : logi NA NA NA NA NA NA ...
## $ SDF25 : logi NA NA NA NA NA NA ...
## $ SDF26 : logi NA NA NA NA NA NA ...
## $ SDF27A : logi NA NA NA NA NA NA ...
## $ SDF27B : logi NA NA NA NA NA NA ...
## $ SDF27C : logi NA NA NA NA NA NA ...
## $ SDF27D : logi NA NA NA NA NA NA ...
## $ SDF27E : logi NA NA NA NA NA NA ...
## $ SDF27F : logi NA NA NA NA NA NA ...
## $ SDF27G : logi NA NA NA NA NA NA ...
## $ SDF27H : logi NA NA NA NA NA NA ...
## $ SDF28 : logi NA NA NA NA NA NA ...
## $ SDF29 : logi NA NA NA NA NA NA ...
## $ SDF30A : logi NA NA NA NA NA NA ...
## $ SDF30B : logi NA NA NA NA NA NA ...
## $ SDF30C : logi NA NA NA NA NA NA ...
## $ SDF30D : logi NA NA NA NA NA NA ...
## $ SDF30E : logi NA NA NA NA NA NA ...
## $ SDF30F : logi NA NA NA NA NA NA ...
## $ SDF30G : logi NA NA NA NA NA NA ...
## $ SDF31 : logi NA NA NA NA NA NA ...
## $ SDF31A : logi NA NA NA NA NA NA ...
## $ SDF32 : logi NA NA NA NA NA NA ...
## $ SDF33 : logi NA NA NA NA NA NA ...
## $ SDF33A : logi NA NA NA NA NA NA ...
## $ SDF34 : logi NA NA NA NA NA NA ...
## $ SDF35 : logi NA NA NA NA NA NA ...
## $ SDF36 : logi NA NA NA NA NA NA ...
## $ SDF37 : logi NA NA NA NA NA NA ...
## $ SDF38 : logi NA NA NA NA NA NA ...
## $ SDF39 : logi NA NA NA NA NA NA ...
## $ SDF40 : logi NA NA NA NA NA NA ...
## $ SDF41 : logi NA NA NA NA NA NA ...
## $ SDF42 : logi NA NA NA NA NA NA ...
## $ SDF42A : logi NA NA NA NA NA NA ...
## $ SDF42B : logi NA NA NA NA NA NA ...
## $ SDF43A : logi NA NA NA NA NA NA ...
## $ SDF43A1 : logi NA NA NA NA NA NA ...
## $ SDF43B : logi NA NA NA NA NA NA ...
## $ SDF43B1 : logi NA NA NA NA NA NA ...
## $ SDF43C : logi NA NA NA NA NA NA ...
## $ SDF43C1 : logi NA NA NA NA NA NA ...
## $ SDF44 : logi NA NA NA NA NA NA ...
## $ SDF44A : logi NA NA NA NA NA NA ...
## $ SDF45A : logi NA NA NA NA NA NA ...
## $ SDF45A1 : logi NA NA NA NA NA NA ...
## $ SDF45B : logi NA NA NA NA NA NA ...
## $ SDF45B1 : logi NA NA NA NA NA NA ...
## $ SDF46 : logi NA NA NA NA NA NA ...
## $ SDF47 : logi NA NA NA NA NA NA ...
## $ SDF48 : logi NA NA NA NA NA NA ...
## $ SDF49A : logi NA NA NA NA NA NA ...
## $ SDF49B : logi NA NA NA NA NA NA ...
## $ SDF49C : logi NA NA NA NA NA NA ...
## $ SDF49D : logi NA NA NA NA NA NA ...
## $ SDF50A : logi NA NA NA NA NA NA ...
## $ SDF50B : logi NA NA NA NA NA NA ...
## $ SDF50C : logi NA NA NA NA NA NA ...
## $ SDF50D : logi NA NA NA NA NA NA ...
## $ SDF51 : logi NA NA NA NA NA NA ...
## $ SDF51A : logi NA NA NA NA NA NA ...
## $ SDF52A : logi NA NA NA NA NA NA ...
## $ SDF52B : logi NA NA NA NA NA NA ...
## $ SDF53A : logi NA NA NA NA NA NA ...
## $ SDF53A1 : logi NA NA NA NA NA NA ...
## $ SDF53B : logi NA NA NA NA NA NA ...
## $ SDF53C : logi NA NA NA NA NA NA ...
## $ SDF53C1 : logi NA NA NA NA NA NA ...
## $ SDF54 : logi NA NA NA NA NA NA ...
## $ SDF55 : logi NA NA NA NA NA NA ...
## $ SDF56 : logi NA NA NA NA NA NA ...
## $ SDF57A : logi NA NA NA NA NA NA ...
## $ SDF57B : logi NA NA NA NA NA NA ...
## $ SDF57C : logi NA NA NA NA NA NA ...
## $ SDF57D : logi NA NA NA NA NA NA ...
## $ SDF58A : logi NA NA NA NA NA NA ...
## $ SDF58B : logi NA NA NA NA NA NA ...
## $ SDF58C : logi NA NA NA NA NA NA ...
## $ SDF58D : logi NA NA NA NA NA NA ...
## $ SDF59 : logi NA NA NA NA NA NA ...
## $ SDF59A : logi NA NA NA NA NA NA ...
## $ SDF60A : logi NA NA NA NA NA NA ...
## $ SDF60B : logi NA NA NA NA NA NA ...
## $ SDF60C : logi NA NA NA NA NA NA ...
## $ SDF60D : logi NA NA NA NA NA NA ...
## $ SDF60E : logi NA NA NA NA NA NA ...
## $ SDF60F : logi NA NA NA NA NA NA ...
## $ SDF60FS : logi NA NA NA NA NA NA ...
## $ SDF60G : logi NA NA NA NA NA NA ...
## $ SDF60GS : logi NA NA NA NA NA NA ...
## $ SDF61A : logi NA NA NA NA NA NA ...
## $ SDF61A1 : logi NA NA NA NA NA NA ...
## $ SDF61B : logi NA NA NA NA NA NA ...
## $ SDF61C : logi NA NA NA NA NA NA ...
## $ SDF61C1 : logi NA NA NA NA NA NA ...
## $ SDF62A : logi NA NA NA NA NA NA ...
## $ SDF62B : logi NA NA NA NA NA NA ...
## $ SDF62C : logi NA NA NA NA NA NA ...
## $ SDF62D : logi NA NA NA NA NA NA ...
## $ SDF63A : logi NA NA NA NA NA NA ...
## $ SDF63B : logi NA NA NA NA NA NA ...
## $ SDF63C : logi NA NA NA NA NA NA ...
## $ SDF63D : logi NA NA NA NA NA NA ...
## $ SDF64 : logi NA NA NA NA NA NA ...
## $ SDF65 : logi NA NA NA NA NA NA ...
## $ SDF65A : logi NA NA NA NA NA NA ...
## $ SDF66 : logi NA NA NA NA NA NA ...
## load the revised data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_SOCIO_DEMO")
## extract all logical variables (in the str() dump above they are all NA,
## so their real type has to come from the DD)
logicols <- colnames(df)[sapply(df, is.logical)] ## 121
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 121 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## 4 SDF12 VARCHAR2(200)
## 5 SDF13 NUMBER(3)
## 6 SDF14 CHAR(2)
## 7 SDF15 CHAR(2)
## 8 SDF16 CHAR(2)
## 9 SDF17 NUMBER(2)
## 10 SDF17A CHAR(2)
## # ℹ 111 more rows
## select the vars to be converted to numeric
## (matched case-insensitively, like the numeric check further down, because
## the DD is not consistent about the case of its type names, e.g. "date")
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## 55 vars
## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
]
## [1] "REVIEW_DATE"
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 65 vars
## convert each group of all-NA logical columns to the type the DD declares
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## collect the columns that are still date-times (POSIXct) in the sub-dataset
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## collect the columns that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, can ignore, since it has been converted in last step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2021-02-01 1943-09-22
## 2 2020-07-14 1946-10-04
## 3 2020-09-16 1950-10-02
## 4 2020-09-16 1949-04-30
## 5 2019-05-22 1937-10-24
## 6 2020-09-17 1935-10-25
## convert format: POSIXct -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## collect the character variables of the sub-dataset
chrcols <- names(df)[sapply(df, is.character)] ## 89 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## keep only the character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
unique(df$SDF8A) ## DD: 8. Are you working now? Circle all that apply
## [1] "PT" NA "FT" "V" "FT PT"
unique(df$SDF8B) ## DD: If participant says NO, ask Why not? If any of the following SKIP TO #20
## [1] NA "O" "FT" "R" "PD" "FT PD" "R O" "U R"
## [9] "S" "U" "IS" "T" "T R" "R PD"
## NOTE: these two variables are fine, as they are marked "Multiple" in the [Single, Multiple or Calculated Values] column of the DD
## collect the numeric variables of the sub-dataset
numcols <- names(df)[sapply(df, is.numeric)] ## 69 vars
## collect the numeric variables declared in the DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "1;\r\n0;"
## [5] "1 thru 145;" "1 thru 31;"
## [7] "-2;" "0 thru 8;"
## [9] "1;\r\n2;\r\n3;\r\n4;\r\n-2;" "0 thru 100;"
## [11] "0 thru 5;" "0;\r\n1;\r\n-2;"
## [13] "1;\r\n0;\r\n-2;"
## numeric DD rows that actually specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the sub-dataset name
AAAD_SOCIO_DEMO <- df
## start processing AAAD_TRAILS: work on a copy named df, as the steps below do
df <- AAAD_TRAILS
## info() is not defined in this script (presumably it comes from the sourced
## helper file); judging by the output below it reports the number of
## observations, columns and individuals, presumably counted by SYSIND -- TODO confirm
info(AAAD_TRAILS,"SYSIND")
## #obs:439, cols:34, inds:428
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure; the large limits keep str() from truncating the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 439 obs. of 34 variables:
## $ SYSXM : num 7670123 7650923 7651273 7659813 7660113 ...
## $ SYSIND : num 11221133 11218963 11219583 11036793 11221813 ...
## $ SYSGP : num 7929223 7928203 7928153 7893833 7929683 ...
## $ SYSGPSTUDY : num 1367333 1366313 1366263 1309153 1367793 ...
## $ SYSINDGP : num 7984403 7982233 7982853 7792533 7985083 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 88063 88095 88002 87595 88059 ...
## $ IND : num 1 1 100 9000 1 1 1 1 100 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2019-03-07" "2019-03-08" ...
## $ EXAMINER : chr "bxf258" "sjt82" "v.rodriguez4" "v.rodriguez4" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1928-01-11" "1941-05-02" ...
## $ AGE_AT_EXAM : num 91 77 68 62 84 84 77 72 71 65 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ TIME_A : num NA 70 56 93 71 71 NA 30 275 60 ...
## $ TIME_AMISS : num -1 NA NA NA NA NA NA NA NA NA ...
## $ ERR_A : num 0 0 2 1 0 0 1 0 1 0 ...
## $ ERR_AMISS : num NA NA NA NA NA NA NA NA NA NA ...
## $ COR_A : num 24 24 22 23 24 24 23 24 24 24 ...
## $ COR_AMISS : num NA NA NA NA NA NA NA NA NA NA ...
## $ TIME_B : num NA 225 124 109 240 240 NA 71 NA 90 ...
## $ TIME_BMISS : num -1 NA NA NA NA NA NA NA -2 NA ...
## $ ERR_B : num NA 4 0 0 0 0 NA 0 NA 0 ...
## $ ERR_BMISS : num -1 NA NA NA NA NA NA NA -2 NA ...
## $ COR_B : num NA 20 24 24 24 24 NA 24 NA 24 ...
## $ COR_BMISS : num -1 NA NA NA NA NA NA NA -2 NA ...
## load the revised data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "AAAD_TRAILS")
## extract all logical variables (in the str() dump above they are all NA,
## so their real type has to come from the DD)
logicols <- colnames(df)[sapply(df, is.logical)] ## 3
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## select the vars to be converted to numeric
## NOTE: convert2num must be recomputed from this sheet's DD; otherwise the
## setdiff() below silently reuses the convert2num left over from the
## previously processed sub-dataset. None of the three logical columns listed
## above is a NUMBER type, so it is expected to be empty here.
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
]
## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
]
## [1] "REVIEW_DATE"
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 2 vars
## convert (the numeric step is skipped when there is nothing to convert)
if (length(convert2num) > 0) {
  df[convert2num] <- lapply(df[convert2num], as.numeric)
}
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## collect the columns that are still date-times (POSIXct) in the sub-dataset
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## collect the columns that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, can ignore, since it has been converted in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2019-03-07 1928-01-11
## 2 2019-03-08 1941-05-02
## 3 2019-03-04 1950-03-25
## 4 2019-03-08 1956-10-19
## 5 2019-03-07 1934-11-07
## 6 2019-03-07 1934-12-03
## convert format: POSIXct -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## collect the character variables of the sub-dataset
chrcols <- names(df)[sapply(df, is.character)] ## 8 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## keep only the character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## collect the numeric variables of the sub-dataset
numcols <- names(df)[sapply(df, is.numeric)] ## 23 vars
## collect the numeric variables declared in the DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## [4] "0 thru 150;" "-1;\r\n-2;\r\n-3;" "0 thru 40;"
## [7] "0 thru 24;" "0 thru 300;"
## numeric DD rows that actually specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP and IND
## need to contact Mike about other variables: TIME_A, TIME_B, COR_B
## store the cleaned copy back under the sub-dataset name
AAAD_TRAILS <- df
## start processing ALZ_B9_JUDGE_RC: work on a copy named df, as the steps below do
df <- ALZ_B9_JUDGE_RC
## info() is not defined in this script (presumably it comes from the sourced
## helper file); judging by the output below it reports the number of
## observations, columns and individuals, presumably counted by SYSIND -- TODO confirm
info(ALZ_B9_JUDGE_RC,"SYSIND")
## #obs:483, cols:82, inds:481
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure; the large limits keep str() from truncating the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 483 obs. of 82 variables:
## $ SYSXM : num 8276003 8276013 8258753 8259063 8277553 ...
## $ SYSIND : num 11620433 11160523 11034403 11369813 11620763 ...
## $ SYSGP : num 8005513 7923793 7888823 7952013 8005723 ...
## $ SYSGPSTUDY : num 1452223 1361903 1304163 1397123 1452433 ...
## $ SYSINDGP : num 8389503 7923633 7790023 8139083 8389833 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104507 87883 87556 88301 104457 ...
## $ IND : num 1 1 9001 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-08-09" "2024-02-14" ...
## $ EXAMINER : chr "jjs2031" "gsv32" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1944-06-21" "1939-03-20" ...
## $ AGE_AT_EXAM : num 79 84 68 76 76 81 86 73 86 66 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ MEMORY_DECLINE : num 0 1 0 0 1 0 1 0 0 1 ...
## $ COP_RPT_MEMDECLINE : num 8 1 0 0 8 8 0 0 0 1 ...
## $ MEANINGFUL_IMP : num 0 1 0 0 1 0 1 0 0 1 ...
## $ IMP_MEMORY : num NA 1 NA NA 1 NA 1 NA NA 1 ...
## $ IMP_ORIENTATION : num NA 0 NA NA 1 NA 0 NA NA 1 ...
## $ IMP_EXEC_FUNC : num NA 0 NA NA 1 NA 0 NA NA 1 ...
## $ IMP_LANGUAGE : num NA 0 NA NA 0 NA 0 NA NA 0 ...
## $ IMP_VISUOSPATIAL : num NA 0 NA NA 0 NA 0 NA NA 0 ...
## $ IMP_ATTENTION : num NA 0 NA NA 0 NA 0 NA NA 1 ...
## $ IMP_FLUCTUATING_COG : num NA 0 NA NA 0 NA 0 NA NA 0 ...
## $ IMP_FLUCTUATING_AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ IMP_OTHER : num NA 0 NA NA 0 NA 0 NA NA 0 ...
## $ IMP_OTH_SPECIFY : chr NA NA NA NA ...
## $ IMP_PREDOMINANT_SYMP : num NA 1 NA NA 1 NA 1 NA NA 1 ...
## $ IMP_PRED_SYMP_OTH : chr NA NA NA NA ...
## $ IMP_MODE_ONSET : num NA 1 NA NA 1 NA 1 NA NA 1 ...
## $ MODE_ONSET6A : logi NA NA NA NA NA NA ...
## $ BEGIN_AGE : num NA 83 NA NA 76 NA 86 NA NA 63 ...
## $ BEHAV_SYMPTOMS : num 0 1 0 0 0 0 0 0 0 1 ...
## $ BS_APATHY : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_DEPRESSED : num NA 1 NA NA NA NA NA NA NA 1 ...
## $ BS_VISUAL_HAL : num NA 1 NA NA NA NA NA NA NA 0 ...
## $ HAL_WELL_INFORMED : num NA 1 NA NA NA NA NA NA NA NA ...
## $ HAL_BEGIN_AGE : num NA 83 NA NA NA NA NA NA NA NA ...
## $ AUDITORY_HAL : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ ABN_BELIEFS : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_DISINIBITION : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_IRRITABILITY : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_AGITATION : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_PERSONAL_CHG : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_REM : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ REM_BEGIN_AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ BS_ANXIETY : num NA 1 NA NA NA NA NA NA NA 0 ...
## $ BS_OTHER : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ BS_OTHER_SPEC : chr NA NA NA NA ...
## $ BS_PREDOMINANT_SYMP : num NA 2 NA NA NA NA NA NA NA 2 ...
## $ BS_PRED_SYMP_OTH : chr NA NA NA NA ...
## $ BS_MODE_ONSET : num NA 1 NA NA NA NA NA NA NA 2 ...
## $ BS_MODE_ONSET_OTH : chr NA NA NA NA ...
## $ BS_BEGIN_AGE : num NA 74 NA NA NA NA NA NA NA 63 ...
## $ MOTOR_SYPTOMS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MS_GAIT1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_FALLS1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_TREMOR1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_SLOWNESS1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_PRED_SYMPTOM : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_MODE_ONSET : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_MODE_ONSET_OTH : chr NA NA NA NA ...
## $ MS_PARKINSONISM : num NA NA NA NA NA NA NA NA NA NA ...
## $ PARK_BEGIN_AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_ALS : num NA NA NA NA NA NA NA NA NA NA ...
## $ MS_ALS_BEGIN_AGE : logi NA NA NA NA NA NA ...
## $ MS_BEGIN_AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ OVERALL_COURSE_DEC : num 8 1 8 8 1 8 8 8 8 1 ...
## $ PRED_DOMAIN : num 8 2 8 8 1 8 8 8 8 1 ...
## $ LBD_CANDIDATE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FLD_CANDIDATE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NOTES_B9JUDGE : chr NA NA NA NA ...
## $ TOTALSCORE_B9_Q9 : num 0 3 0 0 0 0 0 0 0 1 ...
## $ TOTALSCORE_B9_Q9_STATUS: chr NA NA NA NA ...
## load the revised data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_B9_JUDGE_RC")
## extract all logical variables (in the str() dump above they are all NA,
## so their real type has to come from the DD)
logicols <- colnames(df)[sapply(df, is.logical)]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 5 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## 4 MODE_ONSET6A VARCHAR2(100)
## 5 MS_ALS_BEGIN_AGE NUMBER(3)
## select the vars to be converted to numeric
## (matched case-insensitively, like the numeric check further down, because
## the DD is not consistent about the case of its type names, e.g. "date")
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## MS_ALS_BEGIN_AGE
## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER" "MODE_ONSET6A"
## convert each group of all-NA logical columns to the type the DD declares
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## collect the columns that are still date-times (POSIXct) in the sub-dataset
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## collect the columns that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-08-09 1944-06-21
## 2 2024-02-14 1939-03-20
## 3 2023-06-22 1954-08-20
## 4 2024-02-13 1947-05-13
## 5 2023-04-17 1946-12-19
## 6 2024-02-15 1942-09-30
## convert format: POSIXct -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## collect the character variables of the sub-dataset
chrcols <- names(df)[sapply(df, is.character)] ## 17 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## keep only the character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## collect the numeric variables of the sub-dataset
numcols <- names(df)[sapply(df, is.numeric)] ## 62 vars
## collect the numeric variables declared in the DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
unique(numColsfromDD[["Valid Responses"]])
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "0;\r\n1;\r\n8;"
## [5] "0;\r\n1;"
## [6] "0;\r\n1;\r\n9;"
## [7] "15 thru 110;"
## [8] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n99;"
## [9] "1;\r\n2;\r\n3;\r\n4;\r\n99;"
## [10] "15 through 110;"
## [11] "15 through 110;\r\n888;"
## [12] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;\r\n10;\r\n99;"
## [13] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n8;\r\n9;"
## [14] "1;\r\n2;\r\n3;\r\n8;\r\n9;"
## numeric DD rows that actually specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the sub-dataset name
ALZ_B9_JUDGE_RC <- df
## start processing ALZ_CLINICALSUM: work on a copy named df, as the steps below do
df <- ALZ_CLINICALSUM
## info() is not defined in this script (presumably it comes from the sourced
## helper file); judging by the output below it reports the number of
## observations, columns and individuals, presumably counted by SYSIND -- TODO confirm
info(ALZ_CLINICALSUM,"SYSIND")
## #obs:1484, cols:39, inds:1480
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure; the large limits keep str() from truncating the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 1484 obs. of 39 variables:
## $ SYSXM : num 8063903 8066823 8067393 8065353 8058883 ...
## $ SYSIND : num 11493593 11493813 11493613 11493363 11493633 ...
## $ SYSGP : num 7946353 7946353 7946353 7946353 7946353 ...
## $ SYSGPSTUDY : num 1387463 1387463 1387463 1387463 1387463 ...
## $ SYSINDGP : num 8262663 8262883 8262683 8262433 8262703 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87545 87545 87545 87545 87545 ...
## $ IND : num 9026 1024 144 124 148 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ DATE_OF_BIRTH : POSIXct, format: "1973-01-14" "1941-04-03" ...
## $ LAST_CONTACT_DATE: logi NA NA NA NA NA NA ...
## $ LAST_CONTACT_AGE : logi NA NA NA NA NA NA ...
## $ AGE_OF_DEATH : num NA NA NA NA NA NA NA NA NA NA ...
## $ AGE_OF_EXAM : num 49 80 67 77 64 75 78 61 71 82 ...
## $ IMPRESSION : chr "Affected By Exam" "Affected By Exam" "Affected By Exam" "Affected By Exam" ...
## $ AD_CATEGORY : chr "No Data" "No Data" "No Data" "Definite AD (Exam)" ...
## $ AGE_OF_ONSET : num 43 79 65 57 61 73 71 60 65 75 ...
## $ AOO_DOC_EST_UNK : chr "E" "E" "E" "E" ...
## $ AGE_OF_DIAGNOSIS : num NA 79 65 57 64 NA 73 NA 65 75 ...
## $ AODX_UNKNOWN : chr "U" NA NA NA ...
## $ AD_HX_CATEGORY : chr NA NA NA NA ...
## $ UNCLEAR_CATEGORY : chr NA NA NA NA ...
## $ DEMENT_NAME : chr NA NA NA NA ...
## $ CLINICAL_EXAMINER: chr "katrina/DR. VANCE" "JOSE" "JOSE" "Jose Sanchez" ...
## $ FOLLOW_UP : chr "N" "N" "N" "N" ...
## $ AUTOPSY_DISCUSSED: chr "ND" "N" "ND" "Y" ...
## $ AUTOPSY_PLANNED : chr "ND" "N" "ND" "N" ...
## $ VERIFY_DATE : POSIXct, format: NA NA ...
## $ VERIFY_USER : chr NA NA NA "Jose Javier Sanchez" ...
## $ COMMENTS : chr NA NA NA NA ...
## $ FORM_DATE : POSIXct, format: "2022-03-29" "2022-03-30" ...
## $ FILLED_OUT_BY : chr "kxc672" "jjs2031" "jjs2031" "jjs2031" ...
## load the revised data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_CLINICALSUM")
## extract all logical variables (in the str() dump above they are all NA,
## so their real type has to come from the DD)
logicols <- colnames(df)[sapply(df, is.logical)]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 LAST_CONTACT_DATE DATE
## 3 LAST_CONTACT_AGE NUMBER(2)
## select the vars to be converted to numeric
## (matched case-insensitively, like the date match just below and the numeric
## check further down, so a lower-case type name in the DD is not missed)
convert2num <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)
] ## LAST_CONTACT_AGE
## select the vars to be converted to date
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
] ## LAST_CONTACT_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## REFCTR
## convert each group of all-NA logical columns to the type the DD declares
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## collect the columns that are still date-times (POSIXct) in the sub-dataset
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "DATE_OF_BIRTH" "VERIFY_DATE" "FORM_DATE"
## collect the columns that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "LAST_CONTACT_DATE" can ignore LAST_CONTACT_DATE, as it has been corrected in previous step
## [1] "LAST_CONTACT_DATE"
head(df[, datecols])
## DATE_OF_BIRTH VERIFY_DATE FORM_DATE
## 1 1973-01-14 <NA> 2022-03-29
## 2 1941-04-03 <NA> 2022-03-30
## 3 1955-02-08 <NA> 2022-03-28
## 4 1945-02-17 2022-05-18 2022-03-28
## 5 1958-03-11 2022-05-05 2022-03-29
## 6 1947-02-05 2022-05-05 2022-03-29
## convert format: POSIXct -> Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## collect the character variables of the sub-dataset
chrcols <- names(df)[sapply(df, is.character)] ## 20 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but another type in the DD
## mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## [1] "IMPRESSION" "AD_CATEGORY" "AD_HX_CATEGORY" "UNCLEAR_CATEGORY"
## after checking the unique values of the variables in mismatchChrs_1, I believe that they all should be characters
## so I updated the DD for those variables (I changed their data type in the DD and swapped the values of the "Valid Responses" and " Valid Responses Codes" columns)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## keep only the character variables for which the DD lists valid responses
tmp <- dfDD[
  !is.na(dfDD[["Valid Responses"]]) &
    grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## the description of variable FILLED_OUT_BY says it is a dropdown for people to select from, so I believe that multiple values are fine
## collect the numeric variables of the sub-dataset
numcols <- names(df)[sapply(df, is.numeric)] ## 15 vars
## collect the numeric variables declared in the DD
## check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric DD rows that actually specify valid responses
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the sub-dataset name
ALZ_CLINICALSUM <- df
## start processing ALZ_CSDD: work on a copy named df, as the steps below do
df <- ALZ_CSDD
## info() is not defined in this script (presumably it comes from the sourced
## helper file); judging by the output below it reports the number of
## observations, columns and individuals, presumably counted by SYSIND -- TODO confirm
info(ALZ_CSDD,"SYSIND")
## #obs:181, cols:42, inds:176
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure; the large limits keep str() from truncating the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 181 obs. of 42 variables:
## $ SYSXM : num 7555573 7557803 7551403 7550933 7551073 ...
## $ SYSIND : num 11006333 11039713 11048273 11063923 11048283 ...
## $ SYSGP : num 7888683 7896183 7894423 7894423 7894423 ...
## $ SYSGPSTUDY : num 1304023 1311503 1309743 1309743 1309743 ...
## $ SYSINDGP : num 7761133 7795453 7804133 7822853 7804143 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87535 87657 87650 87650 87650 ...
## $ IND : num 1001 1 105 110 106 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2018-04-13" "2018-04-17" ...
## $ EXAMINER : chr "axr1589" "axr1589" "axr1589" "axr1589" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1932-02-20" "1947-04-23" ...
## $ AGE_AT_EXAM : num 86 70 77 86 82 87 87 85 75 80 ...
## $ ANXIETY : num 2 0 0 0 0 2 -1 0 2 1 ...
## $ SADNESS : num 0 0 0 0 0 0 -1 2 0 1 ...
## $ LACK_REACTION : num 0 1 2 0 0 -1 -1 0 1 1 ...
## $ IRRITABILITY : num 1 0 0 2 0 0 -1 0 2 1 ...
## $ AGITATION : num 1 0 0 0 0 2 -1 0 1 0 ...
## $ RETARDATION : num 0 2 1 1 0 2 -1 0 0 1 ...
## $ MULTI_COMPLAINTS: num 2 0 1 2 0 -1 -1 0 1 1 ...
## $ LOSS_INTEREST : num 1 0 2 0 0 -1 -1 0 2 0 ...
## $ LOSS_APPETITE : num 0 1 1 1 0 2 -1 1 1 1 ...
## $ LOSS_WEIGHT : num 0 0 2 2 1 2 -1 -1 1 0 ...
## $ LACK_ENERGY : num 2 0 2 1 1 2 -1 2 2 1 ...
## $ DIURNAL_MOOD : num 1 0 0 -1 0 2 -1 0 -1 0 ...
## $ DIFF_ASLEEP : num 0 0 0 0 0 2 -1 0 0 1 ...
## $ MULTI_AWAKEN : num 0 1 1 0 0 2 -1 0 0 0 ...
## $ EARLY_AWAKEN : num 0 0 1 0 1 0 -1 0 1 0 ...
## $ SUICIDAL : num 0 0 0 0 0 -1 -1 0 -1 0 ...
## $ SELF_ESTEEM : num 0 0 0 0 0 -1 -1 0 -1 0 ...
## $ PESSIMISM : num 0 0 0 0 0 -1 -1 0 -1 0 ...
## $ MOOD_DELUSIONS : num 0 0 0 0 0 -1 -1 0 -1 0 ...
## $ NOTES_MEDS : chr NA NA NA NA ...
## $ CSDD_SCORE : num 10 5 13 9 3 18 0 5 14 9 ...
## $ CSDD_COUNT : num 19 19 19 18 19 12 0 18 14 19 ...
## data dictionary (DD) sheet describing this sub-dataset
dfDD <- read_excel(path = revisedDDpath, sheet = "ALZ_CSDD")
## columns that were imported as logical
logicols <- names(Filter(is.logical, df)) ## 1
## look up their declared types in the regenerated DD
subset(dfDD, VarNames %in% logicols, select = c("VarNames", "Data Type"))
## # A tibble: 1 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## the DD declares REFCTR as VARCHAR2, so cast it to character
convert2chr <- "REFCTR"
df[convert2chr] <- Map(as.character, df[convert2chr])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## date-time (POSIXct) columns present in the sub-dataset
datecols <- names(Filter(function(column) inherits(column, c("POSIXct", "POSIXt")), df))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables typed as DATE in the regenerated DD
datecolsFromDD <- with(dfDD, VarNames[`Data Type` %in% c("DATE", "date")])
## both differences are empty when data and DD agree on the date variables
setdiff(x = datecols, y = datecolsFromDD) ## character(0)
## character(0)
setdiff(x = datecolsFromDD, y = datecols) ## character(0)
## character(0)
## preview the values before conversion
head(x = df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2018-04-13 1932-02-20
## 2 2018-04-17 1947-04-23
## 3 2018-03-15 1940-06-24
## 4 2018-04-03 1931-07-01
## 5 2018-04-03 1935-05-25
## 6 2018-04-24 1930-06-19
## cast the POSIXct columns to Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [1] "numeric" "character" "Date"
## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 8 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but typed otherwise in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but typed otherwise in the data
chrColsfromDD <- subset(
  dfDD,
  grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Data Type")
)
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
## pass the specification table and the data to the helper (defined outside
## this section) and show its result as an interactive table
DT::datatable(data = check_valid_responses(tmp, df))
## All values are within valid ranges.
## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 32 vars
## check data type inconsistency:
## mismatchNums_1: numeric in the data but typed otherwise in the DD
## mismatchNums_2: NUMBER in the DD but typed otherwise in the data
numColsfromDD <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Valid Responses")
)
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct value specifications attached to the NUMBER variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## [4] "-1;\r\n0;\r\n1;\r\n2;"
## NUMBER variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
DT::datatable(data = check_valid_numeric_responses(tmp, df))
## ignore GP
## store the working copy back under the dataset name
ALZ_CSDD <- df
## ---- ALZ_EXAM ----
## work on a copy named `df`
df <- ALZ_EXAM
## info() is a helper defined outside this section; its recorded output follows
info(ALZ_EXAM, "SYSIND")
## #obs:526, cols:80, inds:522
## list the distinct column classes found in the raw import
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure, with generous max.level / list.len limits
str(object = df, max.level = 99, list.len = 99999)
## 'data.frame': 526 obs. of 80 variables:
## $ SYSXM : num 7541263 7541363 7541493 7540523 7541543 ...
## $ SYSIND : num 11109753 11109763 11109783 11048913 11109793 ...
## $ SYSGP : num 7921103 7921113 7921133 7896183 7921143 ...
## $ SYSGPSTUDY : num 1359213 1359223 1359243 1311503 1359253 ...
## $ SYSINDGP : num 7869273 7869283 7869303 7804773 7869313 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87787 87788 87790 87657 87791 ...
## $ IND : num 1 1 1 102 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ FORM_DATE : POSIXct, format: "2018-03-06" "2018-03-06" ...
## $ FILLED_OUT_BY : chr "v.rodriguez4" "bxf258" "v.rodriguez4" "v.rodriguez4" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1950-06-30" "1956-12-21" ...
## $ NEURO_METHOD : chr "E" "E" "E" "E" ...
## $ NEURO_EXAM_DATE : POSIXct, format: "2018-03-06" "2018-03-06" ...
## $ NEURO_EXAMINER : chr "vanessa r" "briseida felicia" "vanessa" "Vanessa" ...
## $ MOOD_AFFECT : chr "N" "N" "A" "N" ...
## $ DEPRESSED : chr NA NA "Y" NA ...
## $ MANIC : chr NA NA "N" NA ...
## $ MOOD_OTHER : chr NA NA "N" NA ...
## $ MOOD_OTHER_DSC : chr NA NA NA NA ...
## $ SPEECH : chr "N" "N" "N" "N" ...
## $ DYSARTHRIA : chr NA NA NA NA ...
## $ DYSPHASIA : chr NA NA NA NA ...
## $ SPEECH_OTHER : chr NA NA NA NA ...
## $ SPEECH_OTHER_DSC : chr NA NA NA NA ...
## $ FACIAL_EXPRESSION : chr "N" "N" "N" "N" ...
## $ MASKED_FACE : chr NA NA NA NA ...
## $ FACIAL_OTHER : chr NA NA NA NA ...
## $ FACIAL_OTHER_DSC : chr NA NA NA NA ...
## $ OCULAR_MOVEMENT : chr "N" "N" "N" "N" ...
## $ IMPAIRED_UPGAZE : chr NA NA NA NA ...
## $ OCULAR_OTHER : chr NA NA NA NA ...
## $ OCULAR_OTHER_DSC : chr NA NA NA NA ...
## $ BRADY : chr "N" "N" "N" "N" ...
## $ BRADY_GLOBAL : chr NA NA NA NA ...
## $ SLOWED_RAMS : chr NA NA NA NA ...
## $ BRADY_OTHER : chr NA NA NA NA ...
## $ BRADY_OTHER_DSC : chr NA NA NA NA ...
## $ TREMOR : chr "Y" "N" "N" "N" ...
## $ TREMOR_RESTING : chr "N" NA NA NA ...
## $ TREMOR_ACTION : chr "Y" NA NA NA ...
## $ GAIT : chr "N" "N" "N" "N" ...
## $ DECR_ARM_SWING : chr NA NA NA NA ...
## $ SHUFFLING : chr NA NA NA NA ...
## $ MULTI_STEP : chr NA NA NA NA ...
## $ GAIT_OTHER : chr NA NA NA NA ...
## $ GAIT_OTHER_DSC : chr NA NA NA NA ...
## $ POST_STABILITY : chr "N" "N" "N" "N" ...
## $ MOTOR_TONE : chr "N" "N" "N" "N" ...
## $ RIGIDITY : chr NA NA NA NA ...
## $ COGWHEELING : chr NA NA NA NA ...
## $ SPASTICITY : chr NA NA NA NA ...
## $ FLACCIDITY : chr NA NA NA NA ...
## $ MOTOR_ASYM : chr "ND" "ND" "ND" "ND" ...
## $ REFLEXES_ASYM : chr "ND" "N" "ND" "ND" ...
## $ REFLEXES_HYPERACTIVE: chr "ND" "N" "ND" "ND" ...
## $ REFLEXES_DECREASED : chr "ND" "N" "ND" "ND" ...
## $ BABINSKI : chr "ND" "N" "ND" "ND" ...
## $ CLIN_METHOD : chr "E" "E" "E" "E" ...
## $ CLIN_EXAM_DATE : POSIXct, format: "2018-03-06" "2018-03-06" ...
## $ CLIN_EXAMINER : chr "vanessa r" "briseida" NA "Vanessa" ...
## $ PROG_APHASIA : chr "N" "U" "N" "N" ...
## $ AMNESIA : chr "N" "U" "N" "N" ...
## $ LUNG_DX : chr "N" "U" "N" "N" ...
## $ PREV_ARREST : chr "N" "U" "N" "N" ...
## $ SUBSTANCE_ABUSE : chr "Y" "U" "N" "N" ...
## $ SURGERY : chr "N" "U" "N" "N" ...
## $ VAS_DEMENTIA : chr "N" "U" "N" "N" ...
## $ PSY_DISORDER : chr "N" "U" "N" "N" ...
## $ FLUCT_COGNITION : chr "N" "U" "N" "N" ...
## $ DOPAMINE : chr "N" "U" "N" "N" ...
## $ DOPA_CURRENT : chr "N" "U" "N" "N" ...
## $ NEUROLEPTIC : chr "N" "U" "N" "U" ...
## data dictionary (DD) sheet describing this sub-dataset
dfDD <- read_excel(path = revisedDDpath, sheet = "ALZ_EXAM")
## columns that were imported as logical
logicols <- names(Filter(is.logical, df))
## look up their declared types in the regenerated DD
subset(dfDD, VarNames %in% logicols, select = c("VarNames", "Data Type"))
## # A tibble: 1 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## the DD declares REFCTR as VARCHAR2, so cast it to character
convert2chr <- "REFCTR"
df[convert2chr] <- Map(as.character, df[convert2chr])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## date-time (POSIXct) columns present in the sub-dataset
datecols <- names(Filter(function(column) inherits(column, c("POSIXct", "POSIXt")), df))
## [1] "FORM_DATE" "DATE_OF_BIRTH" "NEURO_EXAM_DATE" "CLIN_EXAM_DATE"
## variables typed as DATE in the regenerated DD
datecolsFromDD <- with(dfDD, VarNames[`Data Type` %in% c("DATE", "date")])
## both differences are empty when data and DD agree on the date variables
setdiff(x = datecols, y = datecolsFromDD) ## character(0)
## character(0)
setdiff(x = datecolsFromDD, y = datecols) ## character(0)
## character(0)
## preview the values before conversion
head(x = df[, datecols])
## FORM_DATE DATE_OF_BIRTH NEURO_EXAM_DATE CLIN_EXAM_DATE
## 1 2018-03-06 1950-06-30 2018-03-06 2018-03-06
## 2 2018-03-06 1956-12-21 2018-03-06 2018-03-06
## 3 2018-03-06 1946-10-29 2018-03-06 2018-03-06
## 4 2018-02-19 1946-01-11 2018-02-19 2018-02-19
## 5 2018-03-06 1949-10-06 2018-03-06 2018-03-06
## 6 2018-03-05 1938-11-06 2018-03-05 2018-03-05
## cast the POSIXct columns to Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [1] "numeric" "character" "Date"
## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 66 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but typed otherwise in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but typed otherwise in the data
chrColsfromDD <- subset(
  dfDD,
  grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Data Type")
)
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
## pass the specification table and the data to the helper (defined outside
## this section) and show its result as an interactive table
DT::datatable(data = check_valid_responses(tmp, df))
## ignore NEURO_METHOD and FILLED_OUT_BY
## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 10 vars
## check data type inconsistency:
## mismatchNums_1: numeric in the data but typed otherwise in the DD
## mismatchNums_2: NUMBER in the DD but typed otherwise in the data
numColsfromDD <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Valid Responses")
)
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct value specifications attached to the NUMBER variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## NUMBER variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
DT::datatable(data = check_valid_numeric_responses(tmp, df))
## ignore GP
## store the working copy back under the dataset name
ALZ_EXAM <- df
## ---- ALZ_GAI_SP ----
## work on a copy named `df`
df <- ALZ_GAI_SP
## info() is a helper defined outside this section; its recorded output follows
info(ALZ_GAI_SP, "SYSIND")
## #obs:19, cols:42, inds:19
## list the distinct column classes found in the raw import
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure, with generous max.level / list.len limits
str(object = df, max.level = 99, list.len = 99999)
## 'data.frame': 19 obs. of 42 variables:
## $ SYSXM : num 8095923 8103313 8089823 8065953 8066073 ...
## $ SYSIND : num 11008753 11147113 11008763 11358523 11369753 ...
## $ SYSGP : num 7888993 7922413 7888993 7945143 7951963 ...
## $ SYSGPSTUDY : num 1304333 1360523 1304333 1386053 1397073 ...
## $ SYSINDGP : num 7763553 7910223 7763563 8127793 8139023 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87577 87858 87577 88247 88316 ...
## $ IND : num 9000 103 9001 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2022-07-12" "2022-07-14" ...
## $ EXAMINER : chr "mxc2207" "jjs2031" "jjs2031" "mxc2207" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1944-10-12" "1939-04-08" ...
## $ AGE_AT_EXAM : num 77 83 61 75 85 81 71 80 52 62 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ WORRY_ALOT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DIFF_MAKE_DECISION : num 1 0 0 0 0 0 0 1 0 1 ...
## $ FEEL_JUMPY : num 1 0 0 0 0 0 0 1 0 0 ...
## $ HARD_TO_RELAX : num 0 0 0 0 0 0 0 1 0 0 ...
## $ CANNOT_ENJOY : num 1 0 0 0 0 0 0 0 0 0 ...
## $ THINGS_BOTHER_ME : num 1 0 0 0 0 0 0 0 0 1 ...
## $ BUTTERFLIES : num 0 0 0 0 0 0 0 0 0 0 ...
## $ WORRIER : num 0 0 0 1 1 1 0 1 1 1 ...
## $ RIVIAL_THINGS : num 0 0 0 1 0 0 0 1 1 1 ...
## $ OFTEN_NERVOUS : num 1 1 0 0 1 0 0 0 1 0 ...
## $ THOUGHTS_ANXIOUS : num 1 0 0 0 0 0 0 0 0 0 ...
## $ UPSET_STOMACH : num 0 0 0 0 0 0 0 0 0 0 ...
## $ THINK_MYSELF_NERVOUS: num 0 0 0 1 0 0 0 1 0 0 ...
## $ ANTICIPATE_WORST : num 1 0 0 0 0 0 0 1 0 0 ...
## $ FEEL_SHAKY : num 1 0 0 0 0 0 0 1 0 0 ...
## $ INTERFERE_WITH_LIFE : num 1 0 0 0 0 0 0 0 0 0 ...
## $ OVERWHELM : num 1 0 0 0 0 0 0 0 0 0 ...
## $ FEEL_GREAT_KNOT : num 1 0 0 0 0 0 0 0 0 0 ...
## $ MISS_OUT : num 1 0 0 0 1 0 0 0 0 0 ...
## $ FEEL_UPSET : num 1 0 0 0 0 0 0 0 0 0 ...
## data dictionary (DD) sheet describing this sub-dataset
dfDD <- read_excel(path = revisedDDpath, sheet = "ALZ_GAI_SP")
## columns that were imported as logical
logicols <- names(Filter(is.logical, df))
## look up their declared types in the regenerated DD
subset(dfDD, VarNames %in% logicols, select = c("VarNames", "Data Type"))
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## cast each logical column to the type the DD declares for it
convert2chr <- c("REFCTR", "REVIEWER")
convert2date <- "REVIEW_DATE"
df[convert2chr] <- Map(as.character, df[convert2chr])
df[convert2date] <- Map(as.Date, df[convert2date])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) columns present in the sub-dataset
datecols <- names(Filter(function(column) inherits(column, c("POSIXct", "POSIXt")), df))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables typed as DATE in the regenerated DD
datecolsFromDD <- with(dfDD, VarNames[`Data Type` %in% c("DATE", "date")])
## compare the two sets of date variables
setdiff(x = datecols, y = datecolsFromDD) ## character(0)
## character(0)
## REVIEW_DATE shows up here only because it was already converted to Date in
## the previous step and is therefore not detected as POSIXct; ignore it
setdiff(x = datecolsFromDD, y = datecols)
## [1] "REVIEW_DATE"
## preview the values before conversion
head(x = df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2022-07-12 1944-10-12
## 2 2022-07-14 1939-04-08
## 3 2022-07-12 1961-06-12
## 4 2022-04-01 1946-10-06
## 5 2022-03-31 1936-12-21
## 6 2022-03-30 1940-06-12
## cast the POSIXct columns to Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [1] "numeric" "character" "Date"
## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df))
## [1] "LSTUDY" "DB_OWNER" "STUDY" "SUBSTUDY" "CENTER" "REFCTR" "EXAMINER" "REVIEWER"
## check data type inconsistency:
## mismatchChrs_1: character in the data but typed otherwise in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but typed otherwise in the data
chrColsfromDD <- subset(
  dfDD,
  grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Data Type")
)
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
## pass the specification table and the data to the helper (defined outside
## this section) and show its result as an interactive table
DT::datatable(data = check_valid_responses(tmp, df))
## ignore EXAMINER, as I assume we can have multiple examiners
## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 31 vars
## check data type inconsistency:
## mismatchNums_1: numeric in the data but typed otherwise in the DD
## mismatchNums_2: NUMBER in the DD but typed otherwise in the data
numColsfromDD <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Valid Responses")
)
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct value specifications attached to the NUMBER variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n0;\r\n"
## NUMBER variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
DT::datatable(data = check_valid_numeric_responses(tmp, df))
## All numeric values are within valid ranges.
## store the working copy back under the dataset name
ALZ_GAI_SP <- df
## ---- ALZ_LOAD_COG ----
## work on a copy named `df`
df <- ALZ_LOAD_COG
## info() is a helper defined outside this section; its recorded output follows
info(ALZ_LOAD_COG, "SYSIND")
## #obs:1006, cols:41, inds:907
## list the distinct column classes found in the raw import
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure, with generous max.level / list.len limits
str(object = df, max.level = 99, list.len = 99999)
## 'data.frame': 1006 obs. of 41 variables:
## $ SYSXM : num 7540463 7540813 7540903 7540593 7541233 ...
## $ SYSIND : num 11006263 11059623 11059693 11048913 11109753 ...
## $ SYSGP : num 7888673 7897223 7897223 7896183 7921103 ...
## $ SYSGPSTUDY : num 1304013 1312543 1312543 1311503 1359213 ...
## $ SYSINDGP : num 7761063 7818553 7818623 7804773 7869273 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87534 87699 87699 87657 87787 ...
## $ IND : num 104 101 108 102 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ INTERVIEW_DATE: POSIXct, format: "2018-02-21" "2018-02-18" ...
## $ INTERVIEWER : chr "v.rodriguez4" "axr1589" "axr1589" "v.rodriguez4" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1936-09-20" "1929-10-08" ...
## $ INTERVIEW_AGE : num 81 88 68 72 67 61 68 68 79 65 ...
## $ VERSION : chr "2.0" "2.0" "2" "2.0" ...
## $ PHONE : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STORY : num 6 3 4 6 18 19 9 1 9 12 ...
## $ DIGFOR : num 9 5 8 10 9 12 8 8 3 7 ...
## $ DIGBAK : num 6 5 7 4 7 6 4 0 2 5 ...
## $ ANIMALS : num 16 11 20 14 15 17 7 13 NA 13 ...
## $ FRUITS : logi NA NA NA NA NA NA ...
## $ VEG : num 13 6 12 5 14 7 5 7 NA 6 ...
## $ DIGORD : num 7 2 4 5 8 8 7 0 NA 7 ...
## $ DELAY : num 8 0 6 3 12 17 7 0 NA 5 ...
## $ HOWWELL : num NA NA 1 NA 1 1 1 9 1 1 ...
## $ HEARIMP : num NA NA 2 NA 2 2 2 2 2 2 ...
## $ STATUS : num 1 1 1 1 1 1 1 1 4 1 ...
## $ COMM : chr NA NA NA NA ...
## $ ANIMALS_REP : logi NA NA NA NA NA NA ...
## $ ANIMALS_INT : logi NA NA NA NA NA NA ...
## $ VEG_REP : logi NA NA NA NA NA NA ...
## $ VEG_INT : logi NA NA NA NA NA NA ...
## $ DIGFOR_LEN : num NA NA NA NA NA NA NA NA NA NA ...
## $ DIGBAK_LEN : num NA NA NA NA NA NA NA NA NA NA ...
## $ DELAY_LEN : num NA NA NA NA NA NA NA NA NA NA ...
## load the data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_LOAD_COG")
## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 6 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 FRUITS NUMBER
## 3 ANIMALS_REP <NA>
## 4 ANIMALS_INT <NA>
## 5 VEG_REP <NA>
## 6 VEG_INT <NA>
## convert according to the DD: REFCTR (VARCHAR2) to character, FRUITS (NUMBER) to numeric
convert2chr <- c("REFCTR")
convert2num <- c("FRUITS")
## ANIMALS_REP, ANIMALS_INT, VEG_REP and VEG_INT have no Data Type in the DD,
## so they are left as logical for now
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
## each numeric target is converted from its own values; the previous code read
## from df[convert2chr], i.e. it overwrote FRUITS with as.numeric(REFCTR)
df[convert2num] <- lapply(df[convert2num], as.numeric)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "logical"
## date-time (POSIXct) columns present in the sub-dataset
datecols <- names(Filter(function(column) inherits(column, c("POSIXct", "POSIXt")), df))
## [1] "INTERVIEW_DATE" "DATE_OF_BIRTH"
## variables typed as DATE in the regenerated DD
datecolsFromDD <- with(dfDD, VarNames[`Data Type` %in% c("DATE", "date")])
## both differences are empty when data and DD agree on the date variables
setdiff(x = datecols, y = datecolsFromDD) ## character(0)
## character(0)
setdiff(x = datecolsFromDD, y = datecols) ## character(0)
## character(0)
## preview the values before conversion
head(x = df[, datecols])
## INTERVIEW_DATE DATE_OF_BIRTH
## 1 2018-02-21 1936-09-20
## 2 2018-02-18 1929-10-08
## 3 2018-02-19 1949-08-01
## 4 2018-02-19 1946-01-11
## 5 2018-03-06 1950-06-30
## 6 2018-03-06 1956-12-21
## cast the POSIXct columns to Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [1] "numeric" "character" "Date" "logical"
## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 9 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but typed otherwise in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but typed otherwise in the data
chrColsfromDD <- subset(
  dfDD,
  grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Data Type")
)
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
## pass the specification table and the data to the helper (defined outside
## this section) and show its result as an interactive table
DT::datatable(data = check_valid_responses(tmp, df))
## ignore INTERVIEWER
## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 26 vars
## check data type inconsistency:
## mismatchNums_1: numeric in the data but typed otherwise in the DD
## mismatchNums_2: NUMBER in the DD but typed otherwise in the data
numColsfromDD <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Valid Responses")
)
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct value specifications attached to the NUMBER variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "1;\r\n2;\r\n8;\r\n9;"
## [5] "0 thru 25;\r\n96;\r\n97;\r\n98;\r\n99;"
## [6] "0 thru 12;\r\n96;\r\n97;\r\n98;\r\n99;"
## [7] "0 thru 75;\r\n96;\r\n97;\r\n98;\r\n99;"
## [8] "0 thru 16;\r\n96;\r\n97;\r\n98;\r\n99;"
## [9] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n8;\r\n9;"
## [10] "1;\r\n2;"
## [11] "1;\r\n2;\r\n3;\r\n4;\r\n10;\r\n11;\r\n12;\r\n13;\r\n14;\r\n20;\r\n21;"
## NUMBER variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
DT::datatable(data = check_valid_numeric_responses(tmp, df))
## ignore GP
## store the working copy back under the dataset name
ALZ_LOAD_COG <- df
## ---- ALZ_NCRAD ----
## work on a copy named `df`
df <- ALZ_NCRAD
## info() is a helper defined outside this section; its recorded output follows
info(ALZ_NCRAD, "SYSIND")
## #obs:743, cols:53, inds:742
## list the distinct column classes found in the raw import
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure, with generous max.level / list.len limits
str(object = df, max.level = 99, list.len = 99999)
## 'data.frame': 743 obs. of 53 variables:
## $ SYSXM : num 7895163 7879963 7879983 7880163 7880193 ...
## $ SYSIND : num 11218613 11041143 11041043 11039473 11005233 ...
## $ SYSGP : num 7928123 7894373 7894373 7896023 7888553 ...
## $ SYSGPSTUDY : num 1366233 1309693 1309693 1311343 1303893 ...
## $ SYSINDGP : num 7981883 7797003 7796903 7795213 7760033 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87998 87502 87502 87506 87501 ...
## $ IND : num 1 102 100 1 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ QUALIFY : chr "Unknown" "Yes" "Yes" "Yes" ...
## $ FORM_DATE : POSIXct, format: "2021-02-01" "2018-04-13" ...
## $ FILLED_OUT_BY: chr "sjt82" "v.rodriguez4" "v.rodriguez4" "medical records" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1943-09-22" "1950-10-02" ...
## $ IN_NCRAD : chr NA NA NA NA ...
## $ SAMPLED : num NA NA NA NA NA NA NA NA NA NA ...
## $ EDUC : num 14 12 16 9 12 1 5 16 3 15 ...
## $ VISIT : num NA NA NA NA NA NA NA NA NA NA ...
## $ COMREQ : num NA NA NA NA NA NA NA NA NA NA ...
## $ NOTDEMCI : num NA NA NA NA NA NA NA NA NA NA ...
## $ EVALMETH : num NA NA NA NA NA NA NA NA NA NA ...
## $ EVALYR : num NA NA NA NA NA NA NA NA NA NA ...
## $ CLDEMLEW : num NA NA NA NA NA NA NA NA NA NA ...
## $ COMDXAD : logi NA NA NA NA NA NA ...
## $ NONADDEM : logi NA NA NA NA NA NA ...
## $ COMDXNAD : logi NA NA NA NA NA NA ...
## $ AAOSYMP : num NA NA NA NA 1 1 NA NA NA NA ...
## $ STROKETY : logi NA NA NA NA NA NA ...
## $ STROKEAGE : logi NA NA NA NA NA NA ...
## $ HYPERAGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ HEARTAGE : logi NA NA NA NA NA NA ...
## $ DIABETX : logi NA NA NA NA NA NA ...
## $ DIABETAG : logi NA NA NA NA NA NA ...
## $ PDCLINDX : logi NA NA NA NA NA NA ...
## $ PDAGE : logi NA NA NA NA NA NA ...
## $ DEPRTX : logi NA NA NA NA NA NA ...
## $ DEPRAGE : logi NA NA NA NA NA NA ...
## $ HEADAGE : logi NA NA NA NA NA NA ...
## $ ABUSEAGE : logi NA NA NA NA NA NA ...
## $ COM28_36 : logi NA NA NA NA NA NA ...
## $ COM_ANY : chr NA NA NA NA ...
## $ CONTROL : num NA NA NA NA NA NA NA NA NA NA ...
## $ CONTYPE : num NA NA NA NA NA NA NA NA NA NA ...
## $ RELDEM : num NA NA NA NA NA NA NA NA NA NA ...
## $ GENRSCH : num 2 2 2 2 2 2 2 2 2 2 ...
## $ UNCON_VAL : logi NA NA NA NA NA NA ...
## $ UNCON_UNIT : logi NA NA NA NA NA NA ...
## load the data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NCRAD")
## extract all logical variables
logicols <- colnames(df)[sapply(df, is.logical)]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 18 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 COMDXAD VARCHAR2(255)
## 3 NONADDEM NUMBER(2)
## 4 COMDXNAD VARCHAR2(255)
## 5 STROKETY NUMBER(1)
## 6 STROKEAGE NUMBER(3)
## 7 HEARTAGE NUMBER(3)
## 8 DIABETX NUMBER(1)
## 9 DIABETAG NUMBER(3)
## 10 PDCLINDX NUMBER(1)
## 11 PDAGE NUMBER(3)
## 12 DEPRTX NUMBER(1)
## 13 DEPRAGE NUMBER(3)
## 14 HEADAGE NUMBER(3)
## 15 ABUSEAGE NUMBER(3)
## 16 COM28_36 VARCHAR2(255)
## 17 UNCON_VAL NUMBER(3)
## 18 UNCON_UNIT VARCHAR2(7)
## split the logical columns by the type the DD declares for them
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 13 vars
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 5 vars
## convert
## each numeric target is converted from its own values; the previous code read
## from df[convert2chr], i.e. it filled the 13 NUMBER columns with values taken
## from the 5 CHAR columns
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## date-time (POSIXct) columns present in the sub-dataset
datecols <- names(Filter(function(column) inherits(column, c("POSIXct", "POSIXt")), df))
## [1] "FORM_DATE" "DATE_OF_BIRTH"
## variables typed as DATE in the regenerated DD
datecolsFromDD <- with(dfDD, VarNames[`Data Type` %in% c("DATE", "date")])
## both differences are empty when data and DD agree on the date variables
setdiff(x = datecols, y = datecolsFromDD) ## character(0)
## character(0)
setdiff(x = datecolsFromDD, y = datecols) ## character(0)
## character(0)
## preview the values before conversion
head(x = df[, datecols])
## FORM_DATE DATE_OF_BIRTH
## 1 2021-02-01 1943-09-22
## 2 2018-04-13 1950-10-02
## 3 2020-04-23 1949-04-30
## 4 2016-11-17 1933-03-03
## 5 2019-05-22 1937-10-24
## 6 2020-09-17 1935-10-25
## cast the POSIXct columns to Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(X = df, FUN = class))
## [1] "numeric" "character" "Date"
## character columns present in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 14 vars
## check data type inconsistency:
## mismatchChrs_1: character in the data but typed otherwise in the DD
## mismatchChrs_2: CHAR/VARCHAR in the DD but typed otherwise in the data
chrColsfromDD <- subset(
  dfDD,
  grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Data Type")
)
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
## pass the specification table and the data to the helper (defined outside
## this section) and show its result as an interactive table
DT::datatable(data = check_valid_responses(tmp, df))
## ignore FILLED_OUT_BY
## numeric columns present in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 37 vars
## check data type inconsistency:
## mismatchNums_1: numeric in the data but typed otherwise in the DD
## mismatchNums_2: NUMBER in the DD but typed otherwise in the data
numColsfromDD <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE),
  select = c("VarNames", "Valid Responses")
)
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct value specifications attached to the NUMBER variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "1;\r\n2;"
## [5] "0 thru 50;\r\n99;"
## [6] "1 thru 50;\r\n98;"
## [7] "1;\r\n2;\r\n3;\r\n9;"
## [8] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n9;"
## [9] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n9;"
## [10] "1930 thru 2020;\r\n9999;"
## [11] "1;\r\n2;\r\n9;"
## [12] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;\r\n10;\r\n11;\r\n12;\r\n13;\r\n14;\r\n15;\r\n16;\r\n17;\r\n99;"
## [13] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;\r\n8;\r\n9;"
## [14] "1 thru 80;\r\n999;"
## [15] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n9;"
## [16] "1;\r\n2;\r\n3;\r\n4;\r\n9;\r\n"
## [17] "1;\r\n2; \r\n3;"
## NUMBER variables that carry a value specification in the DD
tmp <- subset(
  dfDD,
  grepl("number", `Data Type`, ignore.case = TRUE) & !is.na(`Valid Responses`),
  select = c("VarNames", "Valid Responses")
)
DT::datatable(data = check_valid_numeric_responses(tmp, df))
## ignore GP
## store the working copy back under the dataset name
ALZ_NCRAD <- df
## ---- ALZ_NEURO_CDR ----
## work on a copy named `df`
df <- ALZ_NEURO_CDR
## info() is a helper defined outside this section; its recorded output follows
info(ALZ_NEURO_CDR, "SYSIND")
## #obs:1221, cols:30, inds:1102
## list the distinct column classes found in the raw import
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## dump the full structure, with generous max.level / list.len limits
str(object = df, max.level = 99, list.len = 99999)
## 'data.frame': 1221 obs. of 30 variables:
## $ SYSXM : num 7540623 7540773 7546423 7546433 7546863 ...
## $ SYSIND : num 11048883 11059623 11044293 11011053 11046873 ...
## $ SYSGP : num 7896183 7897223 7894093 7889553 7894313 ...
## $ SYSGPSTUDY : num 1311503 1312543 1309413 1304893 1309633 ...
## $ SYSINDGP : num 7804743 7818553 7800153 7766073 7802733 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87657 87699 87604 87580 87620 ...
## $ IND : num 1000 101 104 9010 101 106 110 1 102 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2017-02-19" "2018-02-18" ...
## $ EXAMINER : chr "axr1589" "axr1589" "avg55" "v.rodriguez4" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1923-04-17" "1929-10-08" ...
## $ AGE_AT_EXAM : num 93 88 80 75 76 56 86 84 86 91 ...
## $ METHOD : chr "IP" "IP" "TE" "TE" ...
## $ RECONSTRUCTED: chr "U" "U" "N" "N" ...
## $ CDR_TOTAL : num 2 2 3 2 1 1 0.5 3 3 2 ...
## $ MEMORY : num 2 2 3 2 0.5 2 0.5 3 3 2 ...
## $ ORIENTATION : num 2 2 3 1 1 1 0.5 3 3 1 ...
## $ PROBLEM_SOLVE: num 2 1 3 3 1 2 0 3 3 3 ...
## $ COM_AFFAIR : num 2 1 3 2 0.5 1 0 3 3 1 ...
## $ HOME_HOBBIES : num 2 3 3 3 3 1 0.5 3 3 2 ...
## $ PERSONAL_CARE: num 3 2 3 2 2 1 0 3 3 2 ...
## $ CDR_COMM : chr NA NA "Too impaired to complete patient portion." "spoke with daughter about her mother, she is not able to keep a conversation. Barely functions with in the hous"| __truncated__ ...
## load the revised data dictionary (DD) sheet for this dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NEURO_CDR")
## extract all logical variables
## (vapply() always yields a logical mask, even if df has no columns)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## logical columns that the DD types as CHAR/VARCHAR are converted to character
## (TRUE is spelled out: T is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## extract date variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even when only one date column exists
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2017-02-19 1923-04-17
## 2 2018-02-18 1929-10-08
## 3 2018-03-16 1937-04-09
## 4 2018-03-20 1942-07-16
## 5 2018-03-06 1942-02-05
## 6 2018-04-03 1961-10-19
## convert format (date-time columns become Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## extract character variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 10 vars
## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore FILLED_OUT_BY
## NOTE(review): the str() output recorded for this dataset lists no
## FILLED_OUT_BY column - this remark may be carried over from another
## section; confirm which variable is meant.
## extract numeric variables from sub-dataset
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 18 vars
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## list the distinct value specifications the DD gives for numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;"
## [4] "0.0 thru 3.4;\r\n-1;"
## numeric DD variables that come with a value specification
## (numColsfromDD already holds the NUMBER rows and the two columns needed)
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
## compare the observed numeric values with the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected data back under the dataset name
ALZ_NEURO_CDR <- df
## ALZ_NPIQ_CBRS ----
## work on a copy named df
df <- ALZ_NPIQ_CBRS
## info() is a project helper that is not defined in this part of the file;
## its recorded output is kept on the next line
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:123, cols:116, inds:121
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits set high, presumably so that
## wide datasets are listed in full)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 123 obs. of 116 variables:
## $ SYSXM : num 7545813 7557843 7550923 7551043 7558333 ...
## $ SYSIND : num 11039643 11039713 11063923 11048283 11039953 ...
## $ SYSGP : num 7896143 7896183 7894423 7894423 7896303 ...
## $ SYSGPSTUDY : num 1311463 1311503 1309743 1309743 1311623 ...
## $ SYSINDGP : num 7795383 7795453 7822853 7804143 7795693 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87654 87657 87650 87650 87663 ...
## $ IND : num 1 1 110 106 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2018-03-19" "2018-04-17" ...
## $ EXAMINER : chr "avg55" "axr1589" "axr1589" "axr1589" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1933-06-05" "1947-04-23" ...
## $ AGE_AT_EXAM : num 84 70 86 82 87 85 77 75 80 74 ...
## $ NPIQINF : chr "2" "1" "1" "1" ...
## $ NPIQINF_PRO : chr NA NA NA NA ...
## $ NPIQINF_OTH : chr NA NA NA NA ...
## $ NPIQINFA : num 1 1 1 1 1 1 1 1 1 0 ...
## $ NPIQINFB : num 3 3 3 3 3 3 3 3 3 3 ...
## $ NPIQTYPE : num 2 1 1 1 1 2 1 1 1 1 ...
## $ AGIT : num 1 0 0 0 0 0 0 1 0 1 ...
## $ AGITSEV : num 1 NA NA NA NA NA NA 2 NA 1 ...
## $ AGITATION_DIST: num NA NA NA NA NA NA NA 5 NA 1 ...
## $ DEPD : num 1 0 0 0 0 1 0 0 1 0 ...
## $ DEPDSEV : num 1 NA NA NA NA 3 NA NA 2 NA ...
## $ DEPRESS_DIST : num NA NA NA NA NA 5 NA NA 2 NA ...
## $ ANX : num 0 0 0 0 0 0 0 0 0 1 ...
## $ ANXSEV : num NA NA NA NA NA NA NA NA NA 2 ...
## $ ANXIETY_DIST : num NA NA NA NA NA NA NA NA NA 4 ...
## $ ELAT : num 0 0 0 0 0 0 0 1 0 0 ...
## $ ELATSEV : num NA NA NA NA NA NA NA 1 NA NA ...
## $ ELATION_DIST : num NA NA NA NA NA NA NA 0 NA NA ...
## $ APA : num 0 0 0 0 1 0 0 0 1 0 ...
## $ APASEV : num NA NA NA NA 3 NA NA NA 2 NA ...
## $ APATHY_DIST : num NA NA NA NA 0 NA NA NA 2 NA ...
## $ DISN : num 0 0 0 0 0 0 0 1 0 1 ...
## $ DISNSEV : num NA NA NA NA NA NA NA NA NA 2 ...
## $ DISINHIB_DIST : num NA NA NA NA NA NA NA NA NA 0 ...
## $ IRR : num 0 0 0 0 0 0 0 1 0 1 ...
## $ IRRSEV : num NA NA NA NA NA NA NA 3 NA 2 ...
## $ IRRIT_DIST : num NA NA NA NA NA NA NA 5 NA 5 ...
## $ MOT : num 1 0 0 0 0 0 0 0 0 1 ...
## $ MOTSEV : num 1 NA NA NA NA NA NA NA NA 3 ...
## $ MOTOR_DIST : num NA NA NA NA NA NA NA NA NA 0 ...
## $ NITE : num 0 1 0 0 0 0 0 0 0 1 ...
## $ NITESEV : num NA 2 NA NA NA NA NA NA NA 1 ...
## $ NIGHTTIME_DIST: num NA 2 NA NA NA NA NA NA NA 0 ...
## $ APP : num 1 1 1 0 1 0 0 0 0 1 ...
## $ APPSEV : num 2 2 1 NA 3 NA NA NA NA 3 ...
## $ APPETITE_DIST : num NA 0 0 NA 5 NA NA NA NA 2 ...
## $ DEL : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DELSEV : num NA NA NA NA NA NA NA NA NA NA ...
## $ DELUSION_DIST : num NA NA NA NA NA NA NA NA NA NA ...
## $ PARA : num 0 0 0 0 0 0 NA NA NA 0 ...
## $ PARAC : num NA NA NA NA NA NA NA NA NA NA ...
## $ PARAB : num NA NA NA NA NA NA NA NA NA NA ...
## $ PARAD : num NA NA NA NA NA NA NA NA NA NA ...
## $ HALL : num 1 1 0 0 0 0 0 0 0 0 ...
## $ HALLSEV : num 1 1 NA NA NA NA NA NA NA NA ...
## $ HALLUCIN_DIST : num NA 0 NA NA NA NA NA NA NA NA ...
## $ AUDHALL : num 4 1 0 0 8 0 NA NA NA 0 ...
## $ AUDHALLC : num NA NA NA NA 9 NA NA NA NA NA ...
## $ AUDHALLB : num 1 1 NA NA 1 NA NA NA NA NA ...
## $ AUDHALLD : num 0 0 NA NA 0 NA NA NA NA NA ...
## $ VISHALL : num 0 1 NA NA 8 9 NA NA NA NA ...
## $ VISHALLB : num NA 1 NA NA 1 NA NA NA NA NA ...
## $ VISHALLC : num NA NA NA NA 9 NA NA NA NA NA ...
## $ VISHALLD : num NA 0 NA NA 0 NA NA NA NA NA ...
## $ MISIDP : num 2 0 9 9 9 9 NA NA NA 9 ...
## $ MISIDPB : num 1 NA NA NA NA NA NA NA NA NA ...
## $ MISIDPC : num NA NA NA NA NA NA NA NA NA NA ...
## $ MISIDPD : num 0 NA NA NA NA NA NA NA NA NA ...
## $ MISIDSEL : num 0 0 9 9 9 9 NA NA NA 0 ...
## $ MISIDSB : num NA NA NA NA NA NA NA NA NA NA ...
## $ MISIDSC : num NA NA NA NA NA NA NA NA NA NA ...
## $ MISIDSD : num NA NA NA NA NA NA NA NA NA NA ...
## $ MISIDT : num 4 0 9 9 9 9 NA NA NA 0 ...
## $ MISIDTB : num 1 NA NA NA NA NA NA NA NA NA ...
## $ MISIDTC : num NA NA NA NA NA NA NA NA NA NA ...
## $ MISIDTD : num 0 NA NA NA NA NA NA NA NA NA ...
## $ INFID : num 0 0 9 9 9 9 NA NA NA 9 ...
## $ INFIDB : num NA NA NA NA NA NA NA NA NA NA ...
## $ INFIDC : num NA NA NA NA NA NA NA NA NA NA ...
## $ INFIDD : num NA NA NA NA NA NA NA NA NA NA ...
## $ ABND : num 0 0 9 9 9 9 NA NA NA 0 ...
## $ ABNDB : num NA NA NA NA NA NA NA NA NA NA ...
## $ ABNDC : num NA NA NA NA NA NA NA NA NA NA ...
## $ ABNDD : num NA NA NA NA NA NA NA NA NA NA ...
## $ IMP : num 0 0 9 9 9 9 NA NA NA 0 ...
## $ IMPB : num NA NA NA NA NA NA NA NA NA NA ...
## $ IMPC : num NA NA NA NA NA NA NA NA NA NA ...
## $ IMPD : num NA NA NA NA NA NA NA NA NA NA ...
## $ TVR : num 0 0 9 9 9 9 NA NA NA 0 ...
## $ TVRB : num NA NA NA NA NA NA NA NA NA NA ...
## $ TVRC : num NA NA NA NA NA NA NA NA NA NA ...
## $ TVRD : num NA NA NA NA NA NA NA NA NA NA ...
## $ OPIH : num 4 0 9 9 9 9 NA NA NA 0 ...
## $ OPIHB : num 1 NA NA NA NA NA NA NA NA NA ...
## $ OPIHC : num NA NA NA NA NA NA NA NA NA NA ...
## $ OPIHD : num 0 NA NA NA NA NA NA NA NA NA ...
## $ DPSA : num 4 0 9 9 9 9 NA NA NA 0 ...
## $ DPSAB : num 9 NA NA NA NA NA NA NA NA NA ...
## $ DPSAC : num NA NA NA NA NA NA NA NA NA NA ...
## $ DPSAD : num 0 NA NA NA NA NA NA NA NA NA ...
## $ HNH : num 2 0 9 9 9 9 NA NA NA 0 ...
## $ HNHB : num 1 NA NA NA NA NA NA NA NA NA ...
## $ HNHC : num NA NA NA NA NA NA NA NA NA NA ...
## $ HNHD : num 0 NA NA NA NA NA NA NA NA NA ...
## $ INTQUAL : num 0 0 0 0 0 0 NA NA NA 0 ...
## $ NOTES : chr NA NA NA NA ...
## load the revised data dictionary (DD) sheet for this dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_NPIQ_CBRS")
## extract all logical variables
## (vapply() always yields a logical mask, even if df has no columns)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 1 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## logical columns that the DD types as CHAR/VARCHAR are converted to character
## (TRUE is spelled out: T is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## extract date variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## character(0)
## character(0)
## drop = FALSE keeps a data frame even when only one date column exists
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2018-03-19 1933-06-05
## 2 2018-04-17 1947-04-23
## 3 2018-04-03 1931-07-01
## 4 2018-04-03 1935-05-25
## 5 2018-04-24 1930-06-19
## 6 2018-04-25 1933-03-11
## convert format (date-time columns become Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## extract character variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 11 vars
## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore NPIQINF, since it can be multiple values as specified in the DD
## extract numeric variables from sub-dataset
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 103 vars
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## list the distinct value specifications the DD gives for numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "0;\r\n1;"
## [5] "1;\r\n2;\r\n3;"
## [6] "1;\r\n2;"
## [7] "1;\r\n0;"
## [8] "0;\r\n1;\r\n2;\r\n3;\r\n4;\r\n5;"
## [9] "1;\r\n2;\r\n3;\r\n4;\r\n9;\r\n0;\r\n8;"
## [10] "1;\r\n2;\r\n3;\r\n4;\r\n9;"
## [11] "0;\r\n1;\r\n9;"
## [12] "0;\r\n1;\r\n2;"
## numeric DD variables that come with a value specification
## (numColsfromDD already holds the NUMBER rows and the two columns needed)
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
## compare the observed numeric values with the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected data back under the dataset name
ALZ_NPIQ_CBRS <- df
## ALZ_RPFQ ----
## work on a copy named df
df <- ALZ_RPFQ
## info() is a project helper that is not defined in this part of the file;
## its recorded output is kept on the next line
info(ALZ_RPFQ,"SYSIND")
## #obs:132, cols:67, inds:132
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits set high, presumably so that
## wide datasets are listed in full)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 132 obs. of 67 variables:
## $ SYSXM : num 7895173 8010153 8011643 8012863 8001143 ...
## $ SYSIND : num 11218613 11109763 11447143 11458753 11248653 ...
## $ SYSGP : num 7928123 7921113 7968293 7974313 7931713 ...
## $ SYSGPSTUDY : num 1366233 1359223 1413403 1419423 1370023 ...
## $ SYSINDGP : num 7981883 7869283 8216213 8227823 8012383 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87998 87788 88462 88466 88118 ...
## $ IND : num 1 1 100 1 1 1 115 100 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2021-02-01" "2021-11-15" ...
## $ EXAMINER : chr "sjt82" "jjs2031" "jjs2031" "mxc2207" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1943-09-22" "1956-12-21" ...
## $ AGE_AT_EXAM : num 77 64 70 72 77 63 76 72 83 71 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ SMOKE : num 2 1 1 1 1 2 2 1 2 1 ...
## $ SMOKE_AGE_START : num NA 15 15 16 12 NA NA 18 NA 20 ...
## $ SMOKE_CURR : num NA 2 2 2 2 NA NA 2 NA 2 ...
## $ SMOKE_AGE_STOP : num NA 64 40 68 73 NA NA 40 NA 50 ...
## $ PREGNANCIES : num NA NA 6 2 NA NA 1 NA NA NA ...
## $ LIVE_KIDS : num NA NA 4 2 NA NA 1 NA NA NA ...
## $ HRT : num NA NA 2 2 NA NA 2 NA 9 NA ...
## $ HRT_AGE_START : num NA NA NA NA NA NA NA NA NA NA ...
## $ HRT_AGE_STOP : num NA NA NA NA NA NA NA NA NA NA ...
## $ HRT_YEARS : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYSTERECTOMY : num NA NA 2 1 NA NA 2 NA 9 NA ...
## $ HYSTERECTOMY_AGE : num NA NA NA 48 NA NA NA NA NA NA ...
## $ OVARIES_RMV : num NA NA 2 2 NA NA 2 NA 9 NA ...
## $ OVARIES_RMV_AGE : num NA NA NA NA NA NA NA NA NA NA ...
## $ OVARIES_RMV_BOTH : num NA NA NA NA NA NA NA NA NA NA ...
## $ HRT_OVR_RMV : num NA NA NA NA NA NA NA NA NA NA ...
## $ PHYSICAL_ACTIVITIES: num NA 1 0 0 0 1 0 0 0 0 ...
## $ NOPA_REASON : num NA NA 2 0 1 NA 0 1 1 1 ...
## $ VA_PAST2W : num NA 0 0 0 0 0 0 0 NA 1 ...
## $ VA_PAST2W_TIMES : num NA NA NA NA NA NA NA NA NA 1 ...
## $ VA_PAST2W_MINS : num NA NA NA NA NA NA NA NA NA 60 ...
## $ MA_PAST2W : num NA 0 0 0 0 0 0 0 NA 0 ...
## $ MA_PAST2W_TIMES : num NA NA NA NA NA NA NA NA NA NA ...
## $ MA_PAST2W_MINS : num NA NA NA NA NA NA NA NA NA NA ...
## $ LA_PAST2W : num NA 1 0 0 0 1 0 0 NA 1 ...
## $ LA_PAST2W_TIMES : num NA 2 NA NA NA 14 NA NA NA 1 ...
## $ LA_PAST2W_MINS : num NA 15 NA NA NA 30 NA NA NA 60 ...
## $ VA_AR13 : num NA 1 0 1 1 1 0 0 NA 1 ...
## $ VA_AR13_LEVEL : chr NA "V" NA "A" ...
## $ MA_AR13 : num NA 1 1 1 1 1 0 0 NA 1 ...
## $ MA_AR13_LEVEL : chr NA "V" "V" "A" ...
## $ LA_AR13 : num NA 1 1 1 1 1 0 0 NA 1 ...
## $ LA_AR13_LEVEL : chr NA "V" "V" "A" ...
## $ VA_AR24 : num NA 0 0 1 1 0 0 0 NA 1 ...
## $ VA_AR24_LEVEL : chr NA NA NA "V" ...
## $ MA_AR24 : num NA 1 1 1 1 0 0 0 NA 1 ...
## $ MA_AR24_LEVEL : chr NA "F" "V" "V" ...
## $ LA_AR24 : num NA 1 1 1 1 1 0 0 NA 1 ...
## $ LA_AR24_LEVEL : chr NA "F" "V" "V" ...
## $ VA_AR50 : num NA 0 0 0 0 0 0 0 NA 1 ...
## $ VA_AR50_LEVEL : chr NA NA NA NA ...
## $ MA_AR50 : num NA 0 0 0 1 0 0 0 NA 1 ...
## $ MA_AR50_LEVEL : chr NA NA NA NA ...
## $ LA_AR50 : num NA 1 1 1 1 1 0 0 NA 1 ...
## $ LA_AR50_LEVEL : chr NA "V" "F" "V" ...
## load the revised data dictionary (DD) sheet for this dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_RPFQ")
## extract all logical variables
## (vapply() always yields a logical mask, even if df has no columns)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## pick the logical columns to convert, by their declared DD type
## (TRUE is spelled out: T is an ordinary variable and can be reassigned)
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("date", dfDD$`Data Type`, ignore.case = TRUE)
]
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date variables from sub-dataset
## (only date-time columns are picked up here; columns already converted to
## Date in the previous step are not)
## (vapply() always yields a logical mask, even if df has no columns)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data frame even when only one date column exists
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2021-02-01 1943-09-22
## 2 2021-11-15 1956-12-21
## 3 2021-08-18 1951-03-04
## 4 2021-12-06 1949-06-04
## 5 2021-09-09 1944-01-03
## 6 2021-11-15 1958-03-03
## convert format (date-time columns become Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## extract character variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 17 vars
## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## extract numeric variables from sub-dataset
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 47 vars
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## list the distinct value specifications the DD gives for numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "1;\r\n2;\r\n9;"
## [5] "0;\r\n1;\r\n9;" "0;\r\n1;\r\n2;\r\n3;\r\n4;\r\n5;"
## [7] "0;\r\n1;"
## numeric DD variables that come with a value specification
## (numColsfromDD already holds the NUMBER rows and the two columns needed)
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
## compare the observed numeric values with the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected data back under the dataset name
ALZ_RPFQ <- df
## ALZ_SCREENING ----
## work on a copy named df
df <- ALZ_SCREENING
## info() is a project helper that is not defined in this part of the file;
## its recorded output is kept on the next line
info(ALZ_SCREENING,"SYSIND")
## #obs:279, cols:49, inds:272
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits set high, presumably so that
## wide datasets are listed in full)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 279 obs. of 49 variables:
## $ SYSXM : num 7178373 7178243 7178253 7178263 7178273 ...
## $ SYSIND : num 1.1e+07 1.1e+07 1.1e+07 1.1e+07 1.1e+07 ...
## $ SYSGP : num 7894403 7894393 7896003 7896013 7896093 ...
## $ SYSGPSTUDY : num 1309723 1309713 1311323 1311333 1311413 ...
## $ SYSINDGP : num 7793363 7793333 7795173 7795203 7795323 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87648 87503 87504 87505 87512 ...
## $ IND : num 101 1 1 9000 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ FORM_DATE : POSIXct, format: "2017-07-18" "2017-07-14" ...
## $ FILLED_OUT_BY : chr "axr1589" "axr1589" "axr1589" "axr1589" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1948-02-01" "1939-01-13" ...
## $ LUMBAR_YES_NO : chr "N" "N" "N" "N" ...
## $ LUMBAR_DATE : POSIXct, format: NA NA ...
## $ LUMBAR_NO_DATE : chr NA NA NA NA ...
## $ LUMBAR_PUNCTURE : chr NA NA NA NA ...
## $ BRAIN_MRI_YES_NO : chr "N" "Y" "N" "N" ...
## $ BRAIN_MRI_DATE : POSIXct, format: NA NA ...
## $ BRAIN_MRI_NO_DATE: chr NA NA NA NA ...
## $ BRAIN_MRI : chr NA "NL" NA NA ...
## $ BRAIN_CT_YES_NO : chr "N" "N" "N" "N" ...
## $ BRAIN_CT_DATE : POSIXct, format: NA NA ...
## $ BRAIN_CT_NO_DATE : chr NA NA NA NA ...
## $ BRAIN_CT : chr NA NA NA NA ...
## $ EEG_YES_NO : chr "N" "N" "N" "N" ...
## $ EEG_DATE : POSIXct, format: NA NA ...
## $ EEG_NO_DATE : chr NA NA NA NA ...
## $ EEG : chr NA NA NA NA ...
## $ PET_SP_YES_NO : chr "N" "N" "N" "N" ...
## $ PET_SP_DATE : POSIXct, format: NA NA ...
## $ PET_SP_NO_DATE : chr NA NA NA NA ...
## $ PET_SP : chr NA NA NA NA ...
## $ BRAIN_BIO_YES_NO : chr "N" "N" "N" "N" ...
## $ BRAIN_BIO_DATE : logi NA NA NA NA NA NA ...
## $ BRAIN_BIO_NO_DATE: logi NA NA NA NA NA NA ...
## $ BRAIN_BIO : logi NA NA NA NA NA NA ...
## $ LUMB_NOTES : logi NA NA NA NA NA NA ...
## $ BRNMRI_NOTES : logi NA NA NA NA NA NA ...
## $ BRNCT_NOTES : logi NA NA NA NA NA NA ...
## $ EEG_NOTES : logi NA NA NA NA NA NA ...
## $ PETSP_NOTES : logi NA NA NA NA NA NA ...
## $ BRNBIO_NOTES : logi NA NA NA NA NA NA ...
## load the revised data dictionary (DD) sheet for this dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_SCREENING")
## extract all logical variables
## (vapply() always yields a logical mask, even if df has no columns)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 10 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 BRAIN_BIO_DATE DATE
## 3 BRAIN_BIO_NO_DATE CHAR(2)
## 4 BRAIN_BIO CHAR(2)
## 5 LUMB_NOTES VARCHAR2(4000)
## 6 BRNMRI_NOTES VARCHAR2(4000)
## 7 BRNCT_NOTES VARCHAR2(4000)
## 8 EEG_NOTES VARCHAR2(4000)
## 9 PETSP_NOTES VARCHAR2(4000)
## 10 BRNBIO_NOTES VARCHAR2(4000)
## pick the logical columns to convert, by their declared DD type
## (matched case-insensitively, like the other datasets: the DD sheets spell
## the date type both as "DATE" and as "date")
convert2chr <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)
]
convert2date <- dfDD$VarNames[
  dfDD$VarNames %in% logicols &
    grepl("DATE", dfDD$`Data Type`, ignore.case = TRUE)
]
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date variables from sub-dataset
## (only date-time columns are picked up here; columns already converted to
## Date in the previous step are not)
## (vapply() always yields a logical mask, even if df has no columns)
datecols <- colnames(df)[
  vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))
]
## [1] "FORM_DATE" "DATE_OF_BIRTH" "LUMBAR_DATE" "BRAIN_MRI_DATE" "BRAIN_CT_DATE" "EEG_DATE" "PET_SP_DATE"
## extract date variables from regenerated DD
datecolsFromDD <- dfDD$VarNames[dfDD$`Data Type` %in% c("DATE", "date")]
## compare the two to see whether we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## "BRAIN_BIO_DATE", ignore it, it has been corrected in previous step
## [1] "BRAIN_BIO_DATE"
## drop = FALSE keeps a data frame even when only one date column exists
head(df[, datecols, drop = FALSE])
## FORM_DATE DATE_OF_BIRTH LUMBAR_DATE BRAIN_MRI_DATE BRAIN_CT_DATE EEG_DATE
## 1 2017-07-18 1948-02-01 <NA> <NA> <NA> <NA>
## 2 2017-07-14 1939-01-13 <NA> <NA> <NA> <NA>
## 3 2017-07-14 1944-10-03 <NA> <NA> <NA> <NA>
## 4 2017-07-14 1960-10-23 <NA> <NA> <NA> <NA>
## 5 2017-07-14 1940-11-18 <NA> <NA> <NA> <NA>
## 6 2017-07-14 1946-10-04 <NA> <NA> <NA> <NA>
## PET_SP_DATE
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
## convert format (date-time columns become Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## extract character variables from sub-dataset
## (vapply() always yields a logical mask, even if df has no columns)
chrcols <- colnames(df)[vapply(df, is.character, logical(1))] ## 31 vars
## check data type inconsistency:
## mismatchChrs_1: present as chr in data but others in the DD
## mismatchChrs_2: present as chr in DD but others in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## extract character variables with value specification
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of the chr columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore FILLED_OUT_BY
## extract numeric variables from sub-dataset
numcols <- colnames(df)[vapply(df, is.numeric, logical(1))] ## 10 vars
## extract numeric variables from DD
## check data type inconsistency:
## mismatchNums_1: present as numeric in data but others in the DD
## mismatchNums_2: present as numeric in DD but others in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## list the distinct value specifications the DD gives for numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric DD variables that come with a value specification
## (numColsfromDD already holds the NUMBER rows and the two columns needed)
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
## compare the observed numeric values with the DD specification
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected data back under the dataset name
ALZ_SCREENING <- df
## ALZ_SCREENING_RC ----
## work on a copy named df
df <- ALZ_SCREENING_RC
## info() is a project helper that is not defined in this part of the file;
## its recorded output is kept on the next line
info(ALZ_SCREENING_RC,"SYSIND")
## #obs:556, cols:61, inds:552
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits set high, presumably so that
## wide datasets are listed in full)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 556 obs. of 61 variables:
## $ SYSXM : num 8258773 8258813 8260093 8277633 8278003 ...
## $ SYSIND : num 11037673 11369813 11362953 11638763 11621333 ...
## $ SYSGP : num 7894423 7952013 7946353 8007323 8006293 ...
## $ SYSGPSTUDY : num 1309743 1397123 1387463 1454033 1453003 ...
## $ SYSINDGP : num 7793413 8139083 8132223 8407833 8390403 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87650 88301 87545 104540 104528 ...
## $ IND : num 9000 1 106 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-10-24" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1954-10-29" "1947-05-13" ...
## $ AGE_AT_EXAM : num 68 76 66 86 86 67 60 81 77 62 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ LUMB_YN : chr "N" "N" "N" "N" ...
## $ LUMB_DT : POSIXct, format: NA NA ...
## $ LUMB_PUNC : chr NA NA NA NA ...
## $ LUMB_NOTES : chr NA NA NA NA ...
## $ BRNMRI_YN : chr "Y" "N" "Y" "N" ...
## $ BRNMRI_DT : POSIXct, format: "2017-10-01" NA ...
## $ BRAIN_MRI : chr "NL" NA "AC" NA ...
## $ BRNMRI_NOTES : chr NA NA "NO DATE AVAILABLE" NA ...
## $ BRNCT_YN : chr "Y" "N" "N" "N" ...
## $ BRNCT_DT : POSIXct, format: "2017-10-01" NA ...
## $ BRAIN_CT : chr "NL" NA NA NA ...
## $ BRNCT_NOTES : chr NA NA NA NA ...
## $ EEG_YN : chr "Y" "N" "N" "N" ...
## $ EEG_DT : POSIXct, format: "2017-10-01" NA ...
## $ EEG : chr "NL" NA NA NA ...
## $ EEG_NOTES : chr NA NA NA NA ...
## $ PETSP_YN : chr "N" "N" "N" "N" ...
## $ PETSP_DT : POSIXct, format: NA NA ...
## $ PET_SPECT : chr NA NA NA NA ...
## $ PETSP_NOTES : chr NA NA NA NA ...
## $ BRNBIO_YN : chr "N" "N" "N" "N" ...
## $ BRNBIO_DT : logi NA NA NA NA NA NA ...
## $ BRAIN_BIO : logi NA NA NA NA NA NA ...
## $ BRNBIO_NOTES : logi NA NA NA NA NA NA ...
## $ PRIOR_SCORE_MMSE1 : logi NA NA NA NA NA NA ...
## $ DATE_MMSE1 : logi NA NA NA NA NA NA ...
## $ PRIOR_SCORE_MOCA1 : logi NA NA NA NA NA NA ...
## $ DATE_MOCA1 : logi NA NA NA NA NA NA ...
## $ PRIOR_SC_BROOKE1 : logi NA NA NA NA NA NA ...
## $ DATE_BROOKE1 : logi NA NA NA NA NA NA ...
## $ PRIOR_SC_CHIF1 : logi NA NA NA NA NA NA ...
## $ DATE_CHIF1 : logi NA NA NA NA NA NA ...
## $ PRIOR_SC_WORDLIST1: logi NA NA NA NA NA NA ...
## $ DATE_WORDLIST1 : logi NA NA NA NA NA NA ...
## $ OTHER_TEST1 : logi NA NA NA NA NA NA ...
## $ DATE_OTHER_TEST1 : logi NA NA NA NA NA NA ...
## $ PRIOR_CLASSIF1 : logi NA NA NA NA NA NA ...
## $ PRIOR_ASSESS_NOTE1: logi NA NA NA NA NA NA ...
## $ NOTE_ALZ_SCREEN : chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(path = revisedDDpath, sheet = "ALZ_SCREENING_RC")
## names of the columns that were imported as logical
logicols <- names(which(sapply(df, is.logical)))
## look those variables up in the regenerated DD to see their declared types
dfDD[is.element(dfDD[["VarNames"]], logicols), c("VarNames", "Data Type")]
## # A tibble: 20 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## 4 BRNBIO_DT DATE
## 5 BRAIN_BIO CHAR(2)
## 6 BRNBIO_NOTES VARCHAR2(4000)
## 7 PRIOR_SCORE_MMSE1 NUMBER(3)
## 8 DATE_MMSE1 DATE
## 9 PRIOR_SCORE_MOCA1 NUMBER(3)
## 10 DATE_MOCA1 DATE
## 11 PRIOR_SC_BROOKE1 NUMBER(3)
## 12 DATE_BROOKE1 DATE
## 13 PRIOR_SC_CHIF1 NUMBER(3)
## 14 DATE_CHIF1 DATE
## 15 PRIOR_SC_WORDLIST1 NUMBER(3)
## 16 DATE_WORDLIST1 DATE
## 17 OTHER_TEST1 NUMBER(3)
## 18 DATE_OTHER_TEST1 DATE
## 19 PRIOR_CLASSIF1 VARCHAR2(50)
## 20 PRIOR_ASSESS_NOTE1 VARCHAR2(150)
## Recast the columns that were imported as logical to the type the regenerated
## DD declares for them. The DD mixes upper and lower case in `Data Type`
## (e.g. "date" vs "DATE"), so every match is case-insensitive.
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 6 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 8 vars
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 6 vars
## convert: each group is recast from its OWN columns
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## fixed: this previously read from df[convert2chr], which only gave the
## expected result because both groups hold six columns that appear to be all NA;
## with real values it would have copied the character columns into the numeric ones
df[convert2num] <- lapply(df[convert2num], as.numeric)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(which(sapply(df, inherits, what = c("POSIXct", "POSIXt"))))
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "LUMB_DT" "BRNMRI_DT" "BRNCT_DT" "EEG_DT" "PETSP_DT"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][is.element(dfDD[["Data Type"]], c("DATE", "date"))]
## compare the two, in both directions, to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE" "BRNBIO_DT" "DATE_MMSE1" "DATE_MOCA1"
## [5] "DATE_BROOKE1" "DATE_CHIF1" "DATE_WORDLIST1" "DATE_OTHER_TEST1"
## these eight variables were imported as logical and were already converted to
## Date in the previous step, so they are not POSIXct here and can be ignored
head(df[, datecols], n = 6L)
## EXAM_DATE DATE_OF_BIRTH LUMB_DT BRNMRI_DT BRNCT_DT EEG_DT PETSP_DT
## 1 2023-10-24 1954-10-29 <NA> 2017-10-01 2017-10-01 2017-10-01 <NA>
## 2 2024-02-13 1947-05-13 <NA> <NA> <NA> <NA> <NA>
## 3 2024-02-20 1957-08-05 <NA> <NA> <NA> <NA> <NA>
## 4 2023-09-13 1937-08-13 <NA> <NA> <NA> <NA> <NA>
## 5 2023-05-09 1936-05-22 <NA> <NA> <NA> <NA> <NA>
## 6 2023-08-16 1956-01-09 <NA> <NA> <NA> <NA> <NA>
## convert the POSIXct columns to Date
df[datecols] <- lapply(X = df[datecols], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(which(sapply(df, is.character))) ## 29 vars
## check data type inconsistency against the DD:
##   mismatchChrs_1: character in the data but not declared CHAR/VARCHAR in the DD
##   mismatchChrs_2: declared CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(which(sapply(df, is.numeric))) ## 17 vars
## check data type inconsistency against the DD:
##   mismatchNums_1: numeric in the data but not declared NUMBER in the DD
##   mismatchNums_2: declared NUMBER in the DD but not numeric in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct valid-response specifications listed for the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the sub-dataset's own name
ALZ_SCREENING_RC <- df
## ---- next sub-dataset: ALZ_STICK_D_RC ----
## `df` is the working copy that every step below reads and modifies
df <- ALZ_STICK_D_RC
## summary keyed on SYSIND; `info` is not defined in this part of the file
## (presumably it comes from the sourced helper script)
info(ALZ_STICK_D_RC,"SYSIND")
## #obs:430, cols:46, inds:428
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is truncated)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 430 obs. of 46 variables:
## $ SYSXM : num 8275873 8258963 8259113 8277733 8277873 ...
## $ SYSIND : num 11160523 11369813 11037673 11435853 11638763 ...
## $ SYSGP : num 7923793 7952013 7894423 7962813 8007323 ...
## $ SYSGPSTUDY : num 1361903 1397123 1309743 1407923 1454033 ...
## $ SYSINDGP : num 7923633 8139083 7793413 8205123 8407833 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 88301 87650 88452 104540 ...
## $ IND : num 1 1 9000 1 1 106 9000 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "gsv32" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1939-03-20" "1947-05-13" ...
## $ AGE_AT_EXAM : num 84 76 68 81 86 66 56 79 79 77 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ DRSD_I : num 1 1 1 1 1 1 1 1 1 1 ...
## $ DRSD_II : num 1 1 1 1 1 0 1 1 1 1 ...
## $ DRSD_III : num 0 1 0 1 1 0 1 1 0 1 ...
## $ DRSD_IV : num 1 1 1 0 1 1 0 1 0 1 ...
## $ DRSD_V : num 1 1 1 0 1 0 0 1 0 1 ...
## $ DRSD_VI : num 0 0 0 0 0 0 0 1 0 1 ...
## $ DRSD_VII : num 0 1 0 0 0 1 0 0 0 1 ...
## $ DRSD_VIII : num 0 1 0 0 0 1 0 0 0 1 ...
## $ DRSD_IX : num 0 0 0 0 0 1 0 0 0 1 ...
## $ DRSD_X : num 1 0 1 1 0 0 0 1 1 1 ...
## $ DRSD_XI : num 1 0 1 0 0 0 0 1 1 1 ...
## $ DRSD_XII : num 1 0 0 0 0 0 0 1 1 1 ...
## $ COMMENTS_DRSD : chr "did not remember chevron figure" NA NA "unable to remember figures: triangle with stem and chevron" ...
## $ STATUS_DRSD : logi NA NA NA NA NA NA ...
## $ TOTAL_SCORE_ITEM1_DRSD : num 2 3 2 3 3 1 3 3 2 3 ...
## $ TOTAL_SCORE_ITEM1_DRSD_STATUS: logi NA NA NA NA NA NA ...
## $ TOTAL_SCORE_ITEM2_DRSD : num 2 2 2 0 2 1 0 3 0 3 ...
## $ TOTAL_SCORE_ITEM2_DRSD_STATUS: logi NA NA NA NA NA NA ...
## $ TOTAL_SCORE_ITEM3_DRSD : num 0 2 0 0 0 3 0 0 0 3 ...
## $ TOTAL_SCORE_ITEM3_DRSD_STATUS: logi NA NA NA NA NA NA ...
## $ TOTAL_SCORE_ITEM4_DRSD : num 3 0 2 1 0 0 0 3 3 3 ...
## $ TOTAL_SCORE_ITEM4_DRSD_STATUS: logi NA NA NA NA NA NA ...
## $ SUM_TOTAL_SCORE_DRSD : num 7 7 6 4 5 5 3 9 5 12 ...
## $ SUM_TOTAL_SCORE_DRSD_STATUS : chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "ALZ_STICK_D_RC")
## extract all logical variables (vapply always returns a logical vector)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 8 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## 4 STATUS_DRSD NUMBER(3)
## 5 TOTAL_SCORE_ITEM1_DRSD_STATUS CHAR
## 6 TOTAL_SCORE_ITEM2_DRSD_STATUS CHAR
## 7 TOTAL_SCORE_ITEM3_DRSD_STATUS CHAR
## 8 TOTAL_SCORE_ITEM4_DRSD_STATUS CHAR
## select the vars to be converted to numeric; matched case-insensitively
## because the DD mixes case in `Data Type` (e.g. "date" vs "DATE") and the
## numeric consistency check later in this section is case-insensitive as well
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## STATUS_DRSD
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))
## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(which(sapply(df, inherits, what = c("POSIXct", "POSIXt"))))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][is.element(dfDD[["Data Type"]], c("DATE", "date"))]
## compare the two, in both directions, to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
## REVIEW_DATE can be ignored: it was already converted to Date in the previous step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
head(df[, datecols], n = 6L)
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2024-02-13 1947-05-13
## 3 2023-10-24 1954-10-29
## 4 2024-02-15 1942-09-30
## 5 2023-09-13 1937-08-13
## 6 2024-02-20 1957-08-05
## convert the POSIXct columns to Date
df[datecols] <- lapply(X = df[datecols], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(which(sapply(df, is.character))) ## 14 vars
## check data type inconsistency against the DD:
##   mismatchChrs_1: character in the data but not declared CHAR/VARCHAR in the DD
##   mismatchChrs_2: declared CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(which(sapply(df, is.numeric))) ## 29 vars
## check data type inconsistency against the DD:
##   mismatchNums_1: numeric in the data but not declared NUMBER in the DD
##   mismatchNums_2: declared NUMBER in the DD but not numeric in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct valid-response specifications listed for the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "1;\r\n0;\r\n"
## [5] "995;\r\n996;\r\n997;\r\n998;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the sub-dataset's own name
ALZ_STICK_D_RC <- df
## ---- next sub-dataset: B4_CDR_RC ----
## `df` is the working copy that every step below reads and modifies
df <- B4_CDR_RC
## summary keyed on SYSIND; `info` is not defined in this part of the file
## (presumably it comes from the sourced helper script)
info(B4_CDR_RC,"SYSIND")
## #obs:599, cols:38, inds:592
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is truncated)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 599 obs. of 38 variables:
## $ SYSXM : num 8275843 8276023 8276613 8258933 8259053 ...
## $ SYSIND : num 11160523 11620763 11369703 11369813 11037673 ...
## $ SYSGP : num 7923793 8005723 7951913 7952013 7894423 ...
## $ SYSGPSTUDY : num 1361903 1452433 1397023 1397123 1309743 ...
## $ SYSINDGP : num 7923633 8389833 8138973 8139083 7793413 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 104457 88299 88301 87650 ...
## $ IND : num 1 1 1 1 9000 1 1 1 1 106 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2023-04-17" ...
## $ EXAMINER : chr "gsv32" "sjt82" "gsv32" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1939-03-20" "1946-12-19" ...
## $ AGE_AT_EXAM : num 84 76 79 76 68 73 81 86 86 66 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ METHOD_CDR : chr "IP" "IP" "IP" "IP" ...
## $ MEMO_NOTE : chr NA NA NA NA ...
## $ MEMO_SC : num 0.5 1 0.5 0 0 0 0 0.5 0 1 ...
## $ ORIENT_NOTE : chr NA NA NA NA ...
## $ ORIENT_SC : num 0 0.5 0 0 0 0 0 0 0 1 ...
## $ P_SOLVE_NOTE : chr NA NA NA NA ...
## $ P_SOLVE_SC : num 0 1 0 0 0 0 0 0 0 1 ...
## $ COM_AFFAIR_NOTE : chr NA NA NA NA ...
## $ COM_AFFAIR_SC : num 0 1 0 0 0 0 0 0 0 0.5 ...
## $ HOME_HOB_NOTES : chr NA NA NA NA ...
## $ HOME_HOB_SC : num 0 1 0 0 0 0 0 0 0 0.5 ...
## $ P_CARE_NOTE : chr NA NA NA NA ...
## $ P_CARE_SC : num 0 1 0 0 0 0 0 0 0 0.5 ...
## $ CDR_TOTAL_CDR : num 5 1 5 0 0 0 0 5 0 5 ...
## $ SUM_BOXSCORE : num 0.5 5.5 0.5 0 0 0 0 0.5 0 4.5 ...
## $ SUM_BOXSCORE_STATUS: chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "B4_CDR_RC")
## extract all logical variables (vapply always returns a logical vector)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER VARCHAR
## select the vars to be converted to numeric.
## fixed: convert2num used to be left over from the previous sub-dataset
## (ALZ_STICK_D_RC); it is now derived from this sheet so the section no longer
## depends on stale state (none of the three logical columns above is NUMBER)
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(which(sapply(df, inherits, what = c("POSIXct", "POSIXt"))))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][is.element(dfDD[["Data Type"]], c("DATE", "date"))]
## compare the two, in both directions, to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
## REVIEW_DATE can be ignored: it was already converted to Date in the previous step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
head(df[, datecols], n = 6L)
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2023-04-17 1946-12-19
## 3 2024-02-13 1944-09-22
## 4 2024-02-13 1947-05-13
## 5 2023-10-24 1954-10-29
## 6 2023-05-15 1950-04-02
## convert the POSIXct columns to Date
df[datecols] <- lapply(X = df[datecols], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(which(sapply(df, is.character))) ## 16 vars
## check data type inconsistency against the DD:
##   mismatchChrs_1: character in the data but not declared CHAR/VARCHAR in the DD
##   mismatchChrs_2: declared CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(which(sapply(df, is.numeric))) ## 19 vars
## check data type inconsistency against the DD:
##   mismatchNums_1: numeric in the data but not declared NUMBER in the DD
##   mismatchNums_2: declared NUMBER in the DD but not numeric in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct valid-response specifications listed for the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "0;\r\n0.5;\r\n1;\r\n2;\r\n3;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the sub-dataset's own name
B4_CDR_RC <- df
## ---- next sub-dataset: B5_NPIQ_RC ----
## `df` is the working copy that every step below reads and modifies
df <- B5_NPIQ_RC
## summary keyed on SYSIND; `info` is not defined in this part of the file
## (presumably it comes from the sourced helper script)
info(B5_NPIQ_RC,"SYSIND")
## #obs:305, cols:38, inds:304
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is truncated)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 305 obs. of 38 variables:
## $ SYSXM : num 8275943 8258983 8277543 8260623 8261293 ...
## $ SYSIND : num 11160523 11369813 11620763 11163453 11638403 ...
## $ SYSGP : num 7923793 7952013 8005723 7924953 8006953 ...
## $ SYSGPSTUDY : num 1361903 1397123 1452433 1363063 1453663 ...
## $ SYSINDGP : num 7923633 8139083 8389833 7926663 8407473 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 88301 104457 87923 104556 ...
## $ IND : num 1 1 1 9000 1 1 1 1 1 101 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "sjt82" "gsv32" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1939-03-20" "1947-05-13" ...
## $ AGE_AT_EXAM : num 84 76 76 56 79 71 74 64 86 70 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ NPIQINF : num 1 2 3 3 2 2 3 1 3 3 ...
## $ NPIQINF_OTH : chr NA NA "center caretaker" "cousin" ...
## $ NPIQTYPE : num 1 NA NA 1 NA NA 1 NA NA 1 ...
## $ DELSEV : num 0 0 2 0 0 0 0 0 0 0 ...
## $ HALLSEV : num 1 0 2 0 0 0 0 0 0 0 ...
## $ AGITSEV : num 0 0 0 0 0 2 0 0 1 0 ...
## $ DEPDSEV : num 1 0 0 1 1 2 0 0 0 0 ...
## $ ANXSEV : num 1 0 0 0 2 2 0 0 1 0 ...
## $ ELATSEV : num 0 0 0 0 0 2 0 0 1 0 ...
## $ APASEV : num 0 0 0 0 0 3 0 0 0 0 ...
## $ DISNSEV : num 0 0 0 0 0 2 0 0 0 0 ...
## $ IRRSEV : num 0 0 0 1 0 2 0 0 0 0 ...
## $ MOTSEV : num 0 0 0 0 0 2 0 0 0 0 ...
## $ NITESEV : num 1 0 0 0 0 1 0 0 0 0 ...
## $ APPSEV : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NOTES_NPIQ : chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "B5_NPIQ_RC")
## extract all logical variables (vapply always returns a logical vector)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## split the logical columns by the type the DD declares for them.
## fixed: both patterns are now matched case-insensitively; the DD writes date
## types as both "date" and "DATE", so a case-sensitive "date" match would
## silently leave a column typed "DATE" as logical
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(which(sapply(df, inherits, what = c("POSIXct", "POSIXt"))))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][is.element(dfDD[["Data Type"]], c("DATE", "date"))]
## compare the two, in both directions, to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
## REVIEW_DATE can be ignored: it was already converted to Date in the previous step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
head(df[, datecols], n = 6L)
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2024-02-13 1947-05-13
## 3 2023-04-17 1946-12-19
## 4 2023-10-25 1967-06-15
## 5 2023-09-12 1944-04-17
## 6 2023-09-12 1952-04-25
## convert the POSIXct columns to Date
df[datecols] <- lapply(X = df[datecols], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(which(sapply(df, is.character))) ## 10 vars
## check data type inconsistency against the DD:
##   mismatchChrs_1: character in the data but not declared CHAR/VARCHAR in the DD
##   mismatchChrs_2: declared CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(which(sapply(df, is.numeric))) ## 25 vars
## check data type inconsistency against the DD:
##   mismatchNums_1: numeric in the data but not declared NUMBER in the DD
##   mismatchNums_2: declared NUMBER in the DD but not numeric in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct valid-response specifications listed for the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;"
## [5] "1;\r\n2;"
## [6] "1;\r\n2;\r\n3;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the sub-dataset's own name
B5_NPIQ_RC <- df
## ---- next sub-dataset: B6_GDS_RC ----
## `df` is the working copy that every step below reads and modifies
df <- B6_GDS_RC
## summary keyed on SYSIND; `info` is not defined in this part of the file
## (presumably it comes from the sourced helper script)
info(B6_GDS_RC,"SYSIND")
## #obs:543, cols:39, inds:539
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is truncated)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 543 obs. of 39 variables:
## $ SYSXM : num 8276623 8258953 8259103 8277723 8277863 ...
## $ SYSIND : num 11369703 11369813 11037673 11435853 11638763 ...
## $ SYSGP : num 7951913 7952013 7894423 7962813 8007323 ...
## $ SYSGPSTUDY : num 1397023 1397123 1309743 1407923 1454033 ...
## $ SYSINDGP : num 8138973 8139083 7793413 8205123 8407833 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 88299 88301 87650 88452 104540 ...
## $ IND : num 1 1 9000 1 1 1 1 106 9000 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-13" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "gsv32" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1944-09-22" "1947-05-13" ...
## $ AGE_AT_EXAM : num 79 76 68 81 86 73 86 66 56 73 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ LIFE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ACTIVITY : num 0 0 0 0 1 0 0 0 0 0 ...
## $ EMPTY : num 0 0 0 1 0 0 0 0 0 0 ...
## $ BORED : num 0 0 0 0 1 0 0 0 0 0 ...
## $ SPIRIT : num 0 0 0 0 1 0 0 0 0 0 ...
## $ AFRAID : num 0 0 1 0 0 0 0 0 0 1 ...
## $ HAPPY : num 0 0 0 0 1 0 0 1 0 0 ...
## $ HELPLESS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ STAY_HOME : num 0 0 0 1 0 0 0 0 0 0 ...
## $ MEMORY : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ALIVE : num 0 0 0 0 0 0 0 0 0 0 ...
## $ WORTHLESS : num 0 0 0 0 1 0 0 0 0 0 ...
## $ ENERGY : num 0 0 0 0 1 0 0 0 0 0 ...
## $ HOPELESS : num 0 0 0 0 1 0 0 0 0 0 ...
## $ BETTER_OFF : num 0 0 0 1 0 0 0 0 0 0 ...
## $ INCOMPLETE_GDS: num NA NA NA NA NA NA NA NA NA NA ...
## $ COMMENTS_GDS : chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet for this sub-dataset
dfDD <- read_excel(path = revisedDDpath, sheet = "B6_GDS_RC")
## names of the columns that were imported as logical
logicols <- names(which(sapply(df, is.logical)))
## look those variables up in the regenerated DD to see their declared types
dfDD[is.element(dfDD[["VarNames"]], logicols), c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## split the logical columns by the type the DD declares for them
## (case-insensitive match on `Data Type`)
convert2chr <- dfDD[["VarNames"]][
  is.element(dfDD[["VarNames"]], logicols) &
    grepl("CHAR", dfDD[["Data Type"]], ignore.case = TRUE)
]
convert2date <- dfDD[["VarNames"]][
  is.element(dfDD[["VarNames"]], logicols) &
    grepl("DATE", dfDD[["Data Type"]], ignore.case = TRUE)
]
## recast each group to its declared type
df[convert2chr] <- lapply(X = df[convert2chr], FUN = as.character)
df[convert2date] <- lapply(X = df[convert2date], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns still stored as POSIXct/POSIXt in the sub-dataset
datecols <- names(which(sapply(df, inherits, what = c("POSIXct", "POSIXt"))))
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][is.element(dfDD[["Data Type"]], c("DATE", "date"))]
## compare the two, in both directions, to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
## REVIEW_DATE can be ignored: it was already converted to Date in the previous step
head(df[, datecols], n = 6L)
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13 1944-09-22
## 2 2024-02-13 1947-05-13
## 3 2023-10-24 1954-10-29
## 4 2024-02-15 1942-09-30
## 5 2023-09-13 1937-08-13
## 6 2023-05-15 1950-04-02
## convert the POSIXct columns to Date
df[datecols] <- lapply(X = df[datecols], FUN = as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(which(sapply(df, is.character))) ## 9 vars
## check data type inconsistency against the DD:
##   mismatchChrs_1: character in the data but not declared CHAR/VARCHAR in the DD
##   mismatchChrs_2: declared CHAR/VARCHAR in the DD but not character in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(which(sapply(df, is.numeric))) ## 27 vars
## check data type inconsistency against the DD:
##   mismatchNums_1: numeric in the data but not declared NUMBER in the DD
##   mismatchNums_2: declared NUMBER in the DD but not numeric in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct valid-response specifications listed for the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD[["Data Type"]], ignore.case = TRUE) &
    !is.na(dfDD[["Valid Responses"]]),
  c("VarNames", "Valid Responses")
]
## run the helper that compares the values of these columns with the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the sub-dataset's own name
B6_GDS_RC <- df
## ---- next sub-dataset: B7_FAS_RC ----
## `df` is the working copy that every step below reads and modifies
df <- B7_FAS_RC
## summary keyed on SYSIND; `info` is not defined in this part of the file
## (presumably it comes from the sourced helper script)
info(B7_FAS_RC,"SYSIND")
## #obs:435, cols:33, inds:431
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (limits raised so nothing is truncated)
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 435 obs. of 33 variables:
## $ SYSXM : num 8275913 8275953 8258973 8259133 8277373 ...
## $ SYSIND : num 11620433 11160523 11369813 11037673 11620763 ...
## $ SYSGP : num 8005513 7923793 7952013 7894423 8005723 ...
## $ SYSGPSTUDY : num 1452223 1361903 1397123 1309743 1452433 ...
## $ SYSINDGP : num 8389503 7923633 8139083 7793413 8389833 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104507 87883 88301 87650 104457 ...
## $ IND : num 1 1 1 9000 1 1 1 1 106 9000 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-08-09" "2024-02-14" ...
## $ EXAMINER : chr "jjs2031" "gsv32" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1944-06-21" "1939-03-20" ...
## $ AGE_AT_EXAM : num 79 84 76 68 76 81 73 86 66 56 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ FAQ1 : num 0 8 0 8 0 0 0 0 0 0 ...
## $ FAQ2 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ3 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ4 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ5 : num 0 0 0 8 1 0 0 0 1 0 ...
## $ FAQ6 : num 0 0 0 8 1 0 0 0 1 0 ...
## $ FAQ7 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ8 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ9 : num 0 0 0 8 1 0 0 0 0 0 ...
## $ FAQ10 : num 0 2 0 8 1 0 0 0 1 0 ...
## $ NOTES_B7FAS : chr NA NA NA NA ...
## data dictionary (DD) sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "B7_FAS_RC")
## columns that were imported as logical
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## look up the type the regenerated DD declares for each of them
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## split the logical columns by DD type: CHAR/VARCHAR2 -> character, date -> Date
convert2chr <- dfDD$VarNames[
  grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE) & dfDD$VarNames %in% logicols
]
convert2date <- dfDD$VarNames[
  grepl("date", dfDD$`Data Type`, ignore.case = TRUE) & dfDD$VarNames %in% logicols
]
## apply the conversions
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## recheck: "logical" should no longer appear among the classes
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset (still stored as POSIXct at this point)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## two-way comparison, to make sure no date variable is missed
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE only: ignore it, it was already converted to Date in the logical-column step
## [1] "REVIEW_DATE"
## preview the values before converting
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-08-09 1944-06-21
## 2 2024-02-14 1939-03-20
## 3 2024-02-13 1947-05-13
## 4 2023-10-24 1954-10-29
## 5 2023-04-17 1946-12-19
## 6 2024-02-15 1942-09-30
## convert POSIXct to Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns currently present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 9 vars
## DD rows whose declared type starts with CHAR or VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
## type agreement between data and DD, checked in both directions:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## character variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of those columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns currently present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 21 vars
## DD rows declared as NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## type agreement between data and DD, checked in both directions:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "0;\r\n1;\r\n2;\r\n3;\r\n8;"
## numeric variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its own name
B7_FAS_RC <- df
## ---- BCF_RECOG_RC: load and inspect ----
## work on a copy named df; the cleaned copy is written back to BCF_RECOG_RC at the end of this section
df <- BCF_RECOG_RC
## summary from the project helper info(); its recorded output is kept on the next line
## NOTE(review): info() is not defined in this part of the file - confirm what "inds" counts for the "SYSIND" argument
info(BCF_RECOG_RC,"SYSIND")
## #obs:266, cols:24, inds:266
## list the distinct column classes present
## (the result is a list because POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full column-by-column structure; list.len is raised so that no column is cut from the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 266 obs. of 24 variables:
## $ SYSXM : num 8275963 8260183 8260813 8262253 8262463 ...
## $ SYSIND : num 11620763 11620563 11621203 11638453 11638463 ...
## $ SYSGP : num 8005723 8005633 8006163 8007003 8007013 ...
## $ SYSGPSTUDY : num 1452433 1452343 1452873 1453713 1453723 ...
## $ SYSINDGP : num 8389833 8389633 8390273 8407523 8407533 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104457 104477 104455 104549 104548 ...
## $ IND : num 1 1 1 1 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-04-17" "2023-05-15" ...
## $ EXAMINER : chr "sjt82" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1946-12-19" "1949-12-01" ...
## $ AGE_AT_EXAM : num 76 73 81 74 80 74 73 70 81 91 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ CBF_RECOGNIZE_STIMULUS: num 0 0 1 1 1 1 0 1 1 1 ...
## $ COMMENTS_BCFRECOGN : chr NA NA NA NA ...
## data dictionary (DD) sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "BCF_RECOG_RC")
## extract all logical variables
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## select the vars to be converted to numeric
## (derived from THIS sheet so the section no longer depends on a convert2num
## left in the workspace by a previously processed sub-dataset; per the DD
## table above none of the logical columns is NUMBER, so this is character(0))
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert (assigning to an empty column selection leaves df unchanged)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset (still stored as POSIXct at this point)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## two-way comparison, to make sure no date variable is missed
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE only: ignore it, it was already converted to Date in the logical-column step
## [1] "REVIEW_DATE"
## preview the values before converting
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-04-17 1946-12-19
## 2 2023-05-15 1949-12-01
## 3 2023-02-24 1941-10-04
## 4 2023-09-11 1949-05-19
## 5 2023-09-11 1942-10-17
## 6 2023-02-23 1948-11-25
## convert POSIXct to Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns currently present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 9 vars
## DD rows whose declared type starts with CHAR or VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
## type agreement between data and DD, checked in both directions:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## character variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of those columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns currently present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 12 vars
## DD rows declared as NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## type agreement between data and DD, checked in both directions:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "0;\r\n1;"
## numeric variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its own name
BCF_RECOG_RC <- df
## ---- BCFCD_RC: load and inspect ----
## work on a copy named df; the cleaned copy is written back to BCFCD_RC at the end of this section
df <- BCFCD_RC
## summary from the project helper info(); its recorded output is kept on the next line
## NOTE(review): info() is not defined in this part of the file - confirm what "inds" counts for the "SYSIND" argument
info(BCFCD_RC,"SYSIND")
## #obs:269, cols:38, inds:269
## list the distinct column classes present
## (the result is a list because POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full column-by-column structure; list.len is raised so that no column is cut from the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 269 obs. of 38 variables:
## $ SYSXM : num 8275933 8260173 8260803 8262243 8262453 ...
## $ SYSIND : num 11620763 11620563 11621203 11638453 11638463 ...
## $ SYSGP : num 8005723 8005633 8006163 8007003 8007013 ...
## $ SYSGPSTUDY : num 1452433 1452343 1452873 1453713 1453723 ...
## $ SYSINDGP : num 8389833 8389633 8390273 8407523 8407533 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104457 104477 104455 104549 104548 ...
## $ IND : num 1 1 1 1 1 1 1 105 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-04-17" "2023-05-15" ...
## $ EXAMINER : chr "sjt82" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1946-12-19" "1949-12-01" ...
## $ AGE_AT_EXAM : num 76 73 81 74 80 74 73 63 70 81 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ FOURSIDED_DELAY : num 0 2 1 2 1 2 0 2 2 2 ...
## $ STRAIGHT_LINES_DELAY : num 0 2 1 2 2 1 0 2 2 2 ...
## $ MIDDLETHIRD_DELAY : num 0 0 0 1 0 1 0 2 1 0 ...
## $ ROUND_DELAY : num 0 0 1 2 2 1 0 2 2 0 ...
## $ VERTICAL_LINES_DELAY : num 0 0 1 1 1 1 0 2 1 0 ...
## $ BELOW3_DELAY : num 0 2 1 1 1 1 0 2 1 1 ...
## $ VERTEX_DELAY : num 0 0 0 1 0 0 0 2 1 0 ...
## $ GAB87_DELAY : num 0 1 1 1 0 1 0 1 1 1 ...
## $ BONUS_DELAY : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TIME_HOUR_DELAY : chr "01:00 PM" "10:23 AM" "01:19 PM" "12:21 PM" ...
## $ COMMENT_BCFDELAY : chr "Drew a landscape" NA NA NA ...
## $ FILE_NAME1 : chr NA NA NA NA ...
## $ TOTAL_SCORE_BENSON_DELAY : num 0 7 6 11 7 8 0 15 11 6 ...
## $ TOTAL_SCORE_BENSON_DEL_STATUS: logi NA NA NA NA NA NA ...
## $ PLUS_BONUS_DELAY : num 0 7 6 11 7 8 0 15 11 6 ...
## $ PLUS_BONUS_DELAY_STATUS : chr NA NA NA NA ...
## data dictionary (DD) sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "BCFCD_RC")
## extract all logical variables
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 4 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 TOTAL_SCORE_BENSON_DEL_STATUS CHAR
## select the vars to be converted to numeric
## (derived from THIS sheet so the section no longer depends on a convert2num
## left in the workspace by a previously processed sub-dataset; per the DD
## table above none of the logical columns is NUMBER, so this is character(0))
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## "REFCTR" "REVIEWER" "TOTAL_SCORE_BENSON_DEL_STATUS"
## convert (assigning to an empty column selection leaves df unchanged)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset (still stored as POSIXct at this point)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## two-way comparison, to make sure no date variable is missed
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE only: ignore it, it was already converted to Date in the logical-column step
## [1] "REVIEW_DATE"
## preview the values before converting
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-04-17 1946-12-19
## 2 2023-05-15 1949-12-01
## 3 2023-02-24 1941-10-04
## 4 2023-09-11 1949-05-19
## 5 2023-09-11 1942-10-17
## 6 2023-02-23 1948-11-25
## convert POSIXct to Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns currently present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 13 vars
## DD rows whose declared type starts with CHAR or VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
## type agreement between data and DD, checked in both directions:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## character variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of those columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns currently present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 22 vars
## DD rows declared as NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## type agreement between data and DD, checked in both directions:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "0;\r\n1;\r\n2;"
## [5] "0;\r\n1;"
## numeric variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its own name
BCFCD_RC <- df
## ---- BCFCI_RC: load and inspect ----
## work on a copy named df; the cleaned copy is written back to BCFCI_RC at the end of this section
df <- BCFCI_RC
## summary from the project helper info(); its recorded output is kept on the next line
## NOTE(review): info() is not defined in this part of the file - confirm what "inds" counts for the "SYSIND" argument
info(BCFCI_RC,"SYSIND")
## #obs:270, cols:38, inds:270
## list the distinct column classes present
## (the result is a list because POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full column-by-column structure; list.len is raised so that no column is cut from the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 270 obs. of 38 variables:
## $ SYSXM : num 8260073 8260643 8260693 8261453 8278753 ...
## $ SYSIND : num 11620563 11621213 11621203 11621283 11617943 ...
## $ SYSGP : num 8005633 8006173 8006163 8006243 8005103 ...
## $ SYSGPSTUDY : num 1452343 1452883 1452873 1452953 1451813 ...
## $ SYSINDGP : num 8389633 8390283 8390273 8390353 8387013 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104477 104456 104455 104471 104519 ...
## $ IND : num 1 1 1 1 1 1 1 105 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-05-15" "2023-02-24" ...
## $ EXAMINER : chr "jjs2031" "jjs2031" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1949-12-01" "1949-06-10" ...
## $ AGE_AT_EXAM : num 73 73 81 67 67 74 80 63 73 81 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ FOURSIDED : num 2 2 1 1 2 2 2 2 0 2 ...
## $ STRAIGHT_LINES : num 2 2 2 2 2 2 2 2 0 1 ...
## $ MIDDLETHIRD : num 2 2 2 1 2 2 2 2 0 2 ...
## $ ROUND : num 2 2 2 2 2 2 2 2 0 2 ...
## $ VERTICAL_LINES : num 2 2 2 1 2 2 2 2 0 2 ...
## $ BELOW3 : num 2 2 1 1 1 2 2 2 0 2 ...
## $ VERTEX : num 2 2 1 1 2 2 2 2 0 2 ...
## $ GAP87 : num 2 2 1 1 2 2 2 2 0 2 ...
## $ BONUS : num 1 1 0 0 0 1 1 1 0 0 ...
## $ TIME_HOUR_COPY : chr "10:13 AM" "10:44 AM" "01:08 PM" "11:15 AM" ...
## $ COMMENT_BCFCOPY : chr NA NA NA NA ...
## $ FILE_NAME1 : chr NA NA NA NA ...
## $ BCF_COPY_SCORE : num 16 16 12 10 15 16 16 16 0 15 ...
## $ BCF_COPY_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ TOTAL_SCORE_PLUS_BONUS : num 17 17 12 10 15 17 17 17 0 15 ...
## $ TOTAL_SCORE_PLUS_BONUS_STATUS: chr NA NA NA NA ...
## data dictionary (DD) sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "BCFCI_RC")
## extract all logical variables
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 4 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 BCF_COPY_SCORE_STATUS CHAR
## select the vars to be converted to numeric
## (derived from THIS sheet so the section no longer depends on a convert2num
## left in the workspace by a previously processed sub-dataset; per the DD
## table above none of the logical columns is NUMBER, so this is character(0))
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER" "BCF_COPY_SCORE_STATUS"
## convert (assigning to an empty column selection leaves df unchanged)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset (still stored as POSIXct at this point)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## two-way comparison, to make sure no date variable is missed
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE only: ignore it, it was already converted to Date in the logical-column step
## [1] "REVIEW_DATE"
## preview the values before converting
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-05-15 1949-12-01
## 2 2023-02-24 1949-06-10
## 3 2023-02-24 1941-10-04
## 4 2023-05-08 1956-04-15
## 5 2023-08-16 1956-01-09
## 6 2023-09-11 1949-05-19
## convert POSIXct to Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns currently present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 13 vars
## DD rows whose declared type starts with CHAR or VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
## type agreement between data and DD, checked in both directions:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## character variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of those columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns currently present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 22 vars
## DD rows declared as NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## type agreement between data and DD, checked in both directions:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "0;\r\n1;\r\n2;"
## [5] "0;\r\n1;"
## numeric variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its own name
BCFCI_RC <- df
## ---- BILINGUAL_SCALE_RC: load and inspect ----
## work on a copy named df; the cleaned copy is written back to BILINGUAL_SCALE_RC at the end of this section
df <- BILINGUAL_SCALE_RC
## summary from the project helper info(); its recorded output is kept on the next line
## NOTE(review): info() is not defined in this part of the file - confirm what "inds" counts for the "SYSIND" argument
info(BILINGUAL_SCALE_RC,"SYSIND")
## #obs:240, cols:90, inds:240
## list the distinct column classes present
## (the result is a list because POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full column-by-column structure; list.len is raised so that no column is cut from the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 240 obs. of 90 variables:
## $ SYSXM : num 8275903 8275993 8258743 8259043 8277793 ...
## $ SYSIND : num 11160523 11620433 11034403 11369813 11435853 ...
## $ SYSGP : num 7923793 8005513 7888823 7952013 7962813 ...
## $ SYSGPSTUDY : num 1361903 1452223 1304163 1397123 1407923 ...
## $ SYSINDGP : num 7923633 8389503 7790023 8139083 8205123 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCONTROL" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 104507 87556 88301 88452 ...
## $ IND : num 1 1 9001 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2023-08-09" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1939-03-20" "1944-06-21" ...
## $ AGE_AT_EXAM : num 84 79 68 76 81 73 86 66 81 79 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ BILING_YEAR_EDU : num 6 12 14 14 9 20 12 12 14 7 ...
## $ BILING_LANG : chr "Spanish" "SPANISH" "SPANISH" "SPANISH" ...
## $ BILING_OTHER_LANG : num 0 1 1 0 1 0 0 1 0 0 ...
## $ BILINGUAL_LANG_YES1 : chr NA "ENGLISH" "SPANISH" NA ...
## $ BILINGUAL_LANG_YES2 : chr NA "SPANISH" "ENGLISH" NA ...
## $ BILINGUAL_LANG_YES3 : chr NA NA NA NA ...
## $ BILINGUAL_LANG_YES4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_REGION1 : chr NA NA NA NA ...
## $ BILINGUAL_REGION2 : chr NA NA NA NA ...
## $ BILINGUAL_REGION3 : chr NA NA NA NA ...
## $ BILINGUAL_REGION4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_LENGTH1 : chr NA NA NA NA ...
## $ BILINGUAL_LENGTH2 : chr NA NA NA NA ...
## $ BILINGUAL_LENGTH3 : chr NA NA NA NA ...
## $ BILINGUAL_LENGTH4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_LANG1 : chr NA "ENGLISH" "SPANISH" NA ...
## $ BILINGUAL_LANG2 : chr NA "SPANISH" "ENGLISH" NA ...
## $ BILINGUAL_LANG3 : chr NA NA NA NA ...
## $ BILINGUAL_LANG4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_FREQUENCY1: num NA 7 7 NA 7 NA NA NA NA NA ...
## $ BILINGUAL_FREQUENCY2: num NA 7 4 NA 7 NA NA NA NA NA ...
## $ BILINGUAL_FREQUENCY3: num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_FREQUENCY4: logi NA NA NA NA NA NA ...
## $ BILINGUAL_LEARN1 : chr NA "ENGLISH" "SPANISH" NA ...
## $ BILINGUAL_LEARN2 : chr NA "SPANISH" "ENGLISH" NA ...
## $ BILINGUAL_LEARN3 : chr NA NA NA NA ...
## $ BILINGUAL_LEARN4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_HOME1 : num NA NA 1 NA NA NA NA 1 NA NA ...
## $ BILINGUAL_HOME2 : num NA 1 NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_HOME3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_HOME4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_SCHOOL1 : num NA NA 1 NA 8 NA NA NA NA NA ...
## $ BILINGUAL_SCHOOL2 : num NA 1 NA NA NA NA NA 1 NA NA ...
## $ BILINGUAL_SCHOOL3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_SCHOOL4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_MIGRAT1 : num NA 1 NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_MIGRAT2 : num NA NA 1 NA 27 NA NA 1 NA NA ...
## $ BILINGUAL_MIGRAT3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_MIGRAT4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_NONFORMAL1: num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_NONFORMAL2: num NA NA NA NA NA NA NA 1 NA NA ...
## $ BILINGUAL_NONFORMAL3: logi NA NA NA NA NA NA ...
## $ BILINGUAL_NONFORMAL4: logi NA NA NA NA NA NA ...
## $ BILINGUAL_OTHER1 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_OTHER2 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_OTHER3 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_OTHER4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_RATE1 : chr NA "ENGLISH" "SPANISH" NA ...
## $ BILINGUAL_RATE2 : chr NA "SPANISH" "ENGLISH" NA ...
## $ BILINGUAL_RATE3 : chr NA NA NA NA ...
## $ BILINGUAL_RATE4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_READ1 : num NA 7 7 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_READ2 : num NA 7 4 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_READ3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_READ4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_WRITE1 : num NA 7 7 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_WRITE2 : num NA 7 4 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_WRITE3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_SPEAK1 : num NA 7 7 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_WRITE4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_SPEAK2 : num NA 7 4 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_SPEAK3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_SPEAK4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_LISTEN1 : num NA 7 7 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_LISTEN2 : num NA 7 4 NA 7 NA NA 7 NA NA ...
## $ BILINGUAL_LISTEN3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ BILINGUAL_LISTEN4 : logi NA NA NA NA NA NA ...
## $ BILINGUAL_TIME : num NA NA NA NA 20 NA NA NA NA NA ...
## data dictionary (DD) sheet that describes this sub-dataset
dfDD <- read_excel(revisedDDpath, sheet = "BILINGUAL_SCALE_RC")
## columns that were imported as logical
logicols <- names(df)[vapply(df, is.logical, logical(1))] ## 23 vars
## look up the type the regenerated DD declares for each of them
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 23 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 BILINGUAL_LANG_YES4 VARCHAR2(25)
## 5 BILINGUAL_REGION4 VARCHAR2(10)
## 6 BILINGUAL_LENGTH4 VARCHAR2(25)
## 7 BILINGUAL_LANG4 VARCHAR2(25)
## 8 BILINGUAL_FREQUENCY4 NUMBER(2)
## 9 BILINGUAL_LEARN4 VARCHAR2(25)
## 10 BILINGUAL_HOME4 NUMBER(2)
## # ℹ 13 more rows
## split the logical columns by DD type
## NUMBER -> numeric (this pattern is matched case-sensitively)
convert2num <- dfDD$VarNames[
  grepl("NUMBER", dfDD$`Data Type`) & dfDD$VarNames %in% logicols
]
## date -> Date
convert2date <- dfDD$VarNames[
  grepl("date", dfDD$`Data Type`, ignore.case = TRUE) & dfDD$VarNames %in% logicols
] ## REVIEW_DATE
## everything else -> character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))
## apply the conversions
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck: "logical" should no longer appear among the classes
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset (still stored as POSIXct at this point)
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## variables that the regenerated DD declares as dates
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## two-way comparison, to make sure no date variable is missed
setdiff(datecols, datecolsFromDD)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE only: ignore it, it was already converted to Date in the logical-column step
## [1] "REVIEW_DATE"
## preview the values before converting
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2023-08-09 1944-06-21
## 3 2023-06-22 1954-08-20
## 4 2024-02-13 1947-05-13
## 5 2024-02-15 1942-09-30
## 6 2023-05-15 1950-04-02
## convert POSIXct to Date
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns currently present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 33 vars
## DD rows whose declared type starts with CHAR or VARCHAR
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
## type agreement between data and DD, checked in both directions:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## character variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## check whether the unique values of those columns match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns currently present in the sub-dataset
## NOTE(review): this line used to be annotated "15 vars", but the str() listing
## for this sub-dataset already shows 40 numeric columns before any logical
## column is converted to numeric - confirm the actual count
numcols <- names(df)[vapply(df, is.numeric, logical(1))]
## DD rows declared as NUMBER
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## type agreement between data and DD, checked in both directions:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA
## [2] "1 thru 99999;"
## [3] "1 thru 9999;"
## [4] "0;\r\n1;"
## [5] "1;\r\n2;\r\n3;\r\n4;\r\n5;\r\n6;\r\n7;"
## numeric variables for which the DD specifies valid responses
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned sub-dataset back under its own name
BILINGUAL_SCALE_RC <- df
## ---- CAT_FLUENCY_RC: load and inspect ----
## work on a copy named df
df <- CAT_FLUENCY_RC
## summary from the project helper info(); its recorded output is kept on the next line
## NOTE(review): info() is not defined in this part of the file - confirm what "inds" counts for the "SYSIND" argument
info(CAT_FLUENCY_RC,"SYSIND")
## #obs:555, cols:29, inds:550
## list the distinct column classes present
## (the result is a list because POSIXct columns carry two class names)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full column-by-column structure; list.len is raised so that no column is cut from the listing
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 555 obs. of 29 variables:
## $ SYSXM : num 8276513 8258853 8258903 8260133 8277653 ...
## $ SYSIND : num 11369703 11369813 11037673 11620563 11435853 ...
## $ SYSGP : num 7951913 7952013 7894423 8005633 7962813 ...
## $ SYSGPSTUDY : num 1397023 1397123 1309743 1452343 1407923 ...
## $ SYSINDGP : num 8138973 8139083 7793413 8389633 8205123 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 88299 88301 87650 104477 88452 ...
## $ IND : num 1 1 9000 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-13" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "gsv32" "jjs2031" ...
## $ DATE_OF_BIRTH: POSIXct, format: "1944-09-22" "1947-05-13" ...
## $ AGE_AT_EXAM : num 79 76 68 73 81 86 81 73 60 79 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ ANIM_ENTRY : chr "perro, gato, pajaritos, jirafa, cerditos, conejo, paloma, vaca, bueyes, hipopotamos, peces, aguila, avestruz, guinea" "ELEFANTE VACA CHIVE PERRO BUFALO CERDO TORO HORMISA" "perro, conejo, gato, gallina, elefante, caballo, paloma, gato, mono, leon, jirafa, lagartijo, raton, culebra" "DOG CAT BIRD LION CAMEL HORSE ZEBRA CHIT... MONKEY MULE DONKEY OSTRICH PARROT EAGLE Moj... RAT COCKROACH FISH SHARK SARDINE" ...
## $ ANIM_SCORE : num 14 8 13 20 19 12 20 17 25 11 ...
## $ ANIM_STATUS : num NA NA NA NA NA NA NA NA NA NA ...
## $ VEG_ENTRY : chr "tomate, lechuga, ganganbo, repollo, peti poas, calabaza" "BAFATA NAME YAUTIA PAPA MALAGA ZANCHORIA APIO HABICHUELA TERNIA MAIZ" "platano, yautia, name, chayote, pepinillo, remolacha, esparrago, repollo, lechuga, tomate, papa, habichuelas" "MALANGA PUMPKIN PLANTAIN YUCA CORN PEAR (X) PEACH (X) GRAPE (X) STRAWBERRY (X) Sapote (X) Mamey (X) WATERMELON"| __truncated__ ...
## $ VEG_SCORE : num 6 9 14 9 13 11 15 8 16 7 ...
## $ VEG_STATUS : num NA NA NA NA NA NA NA NA NA NA ...
## $ NOTE_CATEGORY: chr NA NA NA NA ...
## read the revised data dictionary (DD) sheet that describes CAT_FLUENCY_RC
dfDD <- read_excel(revisedDDpath, sheet = "CAT_FLUENCY_RC")
## extract all logical variables (their leading values are all NA in the str() output)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## select the vars to be converted to numeric.
## This must be recomputed for every dataset: the setdiff() below uses
## convert2num, and without this line it would silently reuse the vector left
## behind by a previously processed dataset (or fail if none exists yet).
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## character(0)
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert; assigning to an empty column selection leaves df unchanged
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## confirm that no logical column is left after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) variables present in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## date variables according to the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two lists in both directions to see if any date variable is missing
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13 1944-09-22
## 2 2024-02-13 1947-05-13
## 3 2023-10-24 1954-10-29
## 4 2023-05-15 1949-12-01
## 5 2024-02-15 1942-09-30
## 6 2023-05-09 1936-05-22
## turn the POSIXct columns into plain Date columns
df[datecols] <- Map(as.Date, df[datecols])
## only numeric, character and Date should remain
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 11 vars
## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows for character-type variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 15 vars
## numeric variables according to the DD, then check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "995;\r\n996;\r\n997;\r\n998;"
## DD rows for numeric variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the dataset's own name
CAT_FLUENCY_RC <- df
## ---- CERAD_DEL_RC ----
## bind the dataset to the generic name df; every check below operates on df
## and the result is written back to CERAD_DEL_RC at the end of the section
df <- CERAD_DEL_RC
## info() is a project helper (presumably from helperScripts_PRADI.R - confirm);
## judging by the recorded output it reports #observations, #columns and the
## number of distinct individuals identified by SYSIND
info(CERAD_DEL_RC,"SYSIND")
## #obs:177, cols:44, inds:177
## extract all the unique data types
## (logical and POSIXct columns show up here; they are handled in the steps that follow)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; the large max.level / list.len values
## keep str() from cutting the listing short
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 177 obs. of 44 variables:
## $ SYSXM : num 8275853 8260563 8278733 8264043 8264683 ...
## $ SYSIND : num 11160523 11163453 11618053 11620393 11617573 ...
## $ SYSGP : num 7923793 7924953 8005213 8005493 8004733 ...
## $ SYSGPSTUDY : num 1361903 1363063 1451923 1452203 1451443 ...
## $ SYSINDGP : num 7923633 7926663 8387123 8389463 8386643 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 87923 104511 104500 104525 ...
## $ IND : num 1 9000 1 1 1 105 110 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2023-10-25" ...
## $ EXAMINER : chr "gsv32" "gsv32" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1939-03-20" "1967-06-15" ...
## $ AGE_AT_EXAM : num 84 56 77 70 91 63 65 64 76 81 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ WLM_CRTA : num 1 0 1 1 1 0 0 1 1 1 ...
## $ WLM_CRTB : num 0 0 0 0 0 1 1 1 1 0 ...
## $ WLM_CRTC : num 0 1 0 0 0 0 0 1 1 1 ...
## $ WLM_CRTD : num 0 0 0 1 0 0 0 1 0 0 ...
## $ WLM_CRTE : num 0 0 1 1 1 1 1 1 0 1 ...
## $ WLM_CRTF : num 0 0 0 0 0 1 0 1 0 1 ...
## $ WLM_CRTG : num 0 1 0 1 0 1 0 0 1 1 ...
## $ WLM_CRTH : num 0 0 1 0 0 1 0 1 1 0 ...
## $ WLM_CRTI : num 0 0 0 1 0 0 0 1 0 1 ...
## $ WLM_CRTJ : num 0 0 0 0 0 0 0 0 0 0 ...
## $ WLM_INT1 : num 1 1 NA NA NA NA NA NA NA NA ...
## $ WLM_INT2 : num NA 1 NA NA NA NA NA NA NA NA ...
## $ WLM_INT3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_INT4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_INT5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NOTES_CERADRECALL : chr NA NA NA NA ...
## $ WLM_CRT : num 1 2 3 5 2 5 2 8 5 6 ...
## $ WLM_CRT_STATUS : chr NA NA NA NA ...
## $ WLM_INT : num 1 2 NA NA NA NA NA NA NA NA ...
## $ WLM_INT_STATUS : chr "partial" "partial" NA NA ...
## $ SCALES_CERADRECALL : chr "3" "4" "5" "7" ...
## $ SCALES_CERADRECALL_STATUS: chr NA NA NA NA ...
## read the revised data dictionary (DD) sheet that describes CERAD_DEL_RC
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_DEL_RC")
## extract all logical variables (their leading values are all NA in the str() output)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## select the vars to be converted to numeric.
## This must be recomputed for every dataset: the setdiff() below uses
## convert2num, and without this line it would silently reuse the vector left
## behind by a previously processed dataset (or fail if none exists yet).
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## character(0)
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert; assigning to an empty column selection leaves df unchanged
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## confirm that no logical column is left after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) variables present in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## date variables according to the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two lists in both directions to see if any date variable is missing
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2023-10-25 1967-06-15
## 3 2023-08-11 1946-06-19
## 4 2023-08-14 1952-08-29
## 5 2023-08-18 1931-09-20
## 6 2023-06-19 1960-06-04
## turn the POSIXct columns into plain Date columns
df[datecols] <- Map(as.Date, df[datecols])
## only numeric, character and Date should remain
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 13 vars
## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## [1] SCALES_CERADRECALL
## SCALES_CERADRECALL is numeric in the DD but was read in as character,
## because missing values were entered as the string "na".
## Recode those entries to NA, then convert the column to numeric.
df$SCALES_CERADRECALL[df$SCALES_CERADRECALL %in% "na"] <- NA
unique(df$SCALES_CERADRECALL)
## [1] "3" "4" "5" "7" "11" "8" NA "10" "13"
df$SCALES_CERADRECALL <- as.numeric(df$SCALES_CERADRECALL)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows for character-type variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 29 vars
## numeric variables according to the DD, then check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n0;"
## [5] "1;"
## DD rows for numeric variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the dataset's own name
CERAD_DEL_RC <- df
## ---- CERAD_IMM_RC ----
## bind the dataset to the generic name df; every check below operates on df
## and the result is written back to CERAD_IMM_RC at the end of the section
df <- CERAD_IMM_RC
## info() is a project helper (presumably from helperScripts_PRADI.R - confirm);
## judging by the recorded output it reports #observations, #columns and the
## number of distinct individuals identified by SYSIND
info(CERAD_IMM_RC,"SYSIND")
## #obs:188, cols:88, inds:188
## extract all the unique data types
## (logical and POSIXct columns show up here; they are handled in the steps that follow)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; the large max.level / list.len values
## keep str() from cutting the listing short
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 188 obs. of 88 variables:
## $ SYSXM : num 8260413 8278683 8264003 8264323 8264633 ...
## $ SYSIND : num 11163453 11618053 11620393 11618173 11617573 ...
## $ SYSGP : num 7924953 8005213 8005493 8005333 8004733 ...
## $ SYSGPSTUDY : num 1363063 1451923 1452203 1452043 1451443 ...
## $ SYSINDGP : num 7926663 8387123 8389463 8387243 8386643 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87923 104511 104500 104499 104525 ...
## $ IND : num 9000 1 1 1 1 105 110 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-10-25" "2023-08-11" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1967-06-15" "1946-06-19" ...
## $ AGE_AT_EXAM : num 56 77 70 81 91 63 65 69 76 83 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ CERAD_PRESENTATION : num 1 2 2 1 2 1 1 2 1 2 ...
## $ WLM_1A : num 0 1 1 1 1 NA NA 1 1 1 ...
## $ WLM_1B : num 0 0 0 NA NA NA NA 1 NA 0 ...
## $ WLM_1C : num 0 1 0 1 NA NA 1 1 1 1 ...
## $ WLM_1D : num 0 0 0 NA NA NA NA NA NA 0 ...
## $ WLM_1E : num 0 0 0 NA NA 1 NA NA NA 0 ...
## $ WLM_1F : num 0 0 1 NA NA NA NA 1 NA 0 ...
## $ WLM_1G : num 0 0 0 NA NA NA NA 1 NA 0 ...
## $ WLM_1H : num 0 0 0 NA NA NA 1 NA NA 0 ...
## $ WLM_1I : num 0 0 0 1 NA 1 NA NA NA 0 ...
## $ WLM_1J : num 0 0 1 1 1 1 1 1 1 1 ...
## $ WLM_1INT1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_1INT2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_1INT3 : logi NA NA NA NA NA NA ...
## $ WLM_1INT4 : logi NA NA NA NA NA NA ...
## $ WLM_1INT5 : logi NA NA NA NA NA NA ...
## $ WLM_1INT6 : logi NA NA NA NA NA NA ...
## $ WLM_2H : num 1 1 1 0 0 1 1 0 1 0 ...
## $ WLM_2F : num 0 0 1 1 0 1 1 1 0 0 ...
## $ WLM_2A : num 0 1 1 1 1 0 1 1 1 1 ...
## $ WLM_2C : num 1 0 0 1 1 1 0 0 1 1 ...
## $ WLM_2J : num 1 0 0 1 1 1 0 0 1 0 ...
## $ WLM_2B : num 1 0 0 1 0 1 0 1 1 1 ...
## $ WLM_2E : num 0 0 0 1 0 1 0 0 0 0 ...
## $ WLM_2D : num 1 1 0 0 0 0 1 0 0 0 ...
## $ WLM_2G : num 0 0 1 1 0 0 1 1 0 1 ...
## $ WLM_2I : num 1 0 1 1 0 1 1 0 1 1 ...
## $ WLM_2INT1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_2INT2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_2INT3 : logi NA NA NA NA NA NA ...
## $ WLM_2INT4 : logi NA NA NA NA NA NA ...
## $ WLM_2INT5 : logi NA NA NA NA NA NA ...
## $ WLM_2INT6 : logi NA NA NA NA NA NA ...
## $ WLM_3E : num 1 1 1 1 1 1 1 1 1 0 ...
## $ WLM_3I : num 0 0 1 1 0 0 0 0 0 0 ...
## $ WLM_3B : num 1 1 0 1 0 0 1 1 1 0 ...
## $ WLM_3F : num 1 1 1 1 0 1 1 1 0 0 ...
## $ WLM_3G : num 1 0 1 1 0 1 0 1 1 0 ...
## $ WLM_3C : num 0 0 0 1 1 1 1 1 1 1 ...
## $ WLM_3A : num 0 1 1 1 1 1 1 1 1 0 ...
## $ WLM_3J : num 1 0 0 0 0 0 1 0 1 1 ...
## $ WLM_3H : num 1 0 1 0 1 1 0 1 1 1 ...
## $ WLM_3D : num 0 1 1 1 1 1 1 0 1 1 ...
## $ WLM_3INT1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_3INT2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_3INT3 : logi NA NA NA NA NA NA ...
## $ WLM_3INT4 : logi NA NA NA NA NA NA ...
## $ WLM_3INT5 : logi NA NA NA NA NA NA ...
## $ WLM_3INT6 : logi NA NA NA NA NA NA ...
## $ COMMENTS_CERAD : chr NA NA NA NA ...
## $ WLM_1 : num 0 2 3 4 2 3 3 6 3 3 ...
## $ WLM_1_STATUS : chr NA NA NA "partial" ...
## $ WLM_1INT : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_1INT_STATUS : chr NA NA NA NA ...
## $ WLM_2 : num 6 3 5 8 3 7 6 4 6 5 ...
## $ WLM_2_STATUS : chr NA NA NA NA ...
## $ WLM_2INT : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_2INT_STATUS : chr NA NA NA NA ...
## $ WLM_3 : num 6 5 7 8 5 7 7 7 8 4 ...
## $ WLM_3_STATUS : chr NA NA NA NA ...
## $ WLM_3INT : num NA NA NA NA NA NA NA NA NA NA ...
## $ WLM_3INT_STATUS : chr NA NA NA NA ...
## $ RAWSCORE_CERAD : num 12 10 15 20 10 17 16 17 17 12 ...
## $ RAWSCORE_CERAD_STATUS : chr NA NA NA "partial" ...
## $ SCALESCORE_CERAD_2 : num 4 4 5 11 4 6 6 6 6 4 ...
## $ SCALESCORE_CERAD_2_STATUS: chr NA NA NA "partial" ...
## read the revised data dictionary (DD) sheet that describes CERAD_IMM_RC
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_IMM_RC")
## extract all logical variables (their leading values are all NA in the str() output)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 15 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 WLM_1INT3 NUMBER(1)
## 5 WLM_1INT4 NUMBER(1)
## 6 WLM_1INT5 NUMBER(1)
## 7 WLM_1INT6 NUMBER(1)
## 8 WLM_2INT3 NUMBER(1)
## 9 WLM_2INT4 NUMBER(1)
## 10 WLM_2INT5 NUMBER(1)
## 11 WLM_2INT6 NUMBER(1)
## 12 WLM_3INT3 NUMBER(1)
## 13 WLM_3INT4 NUMBER(1)
## 14 WLM_3INT5 NUMBER(1)
## 15 WLM_3INT6 NUMBER(1)
## select the vars to be converted to numeric.
## The match is case-insensitive because the DD does not use consistent case
## ("date" vs "DATE" both occur), and to agree with the case-insensitive
## "number" match used for the numeric checks further down.
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 12 vars
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert; assigning to an empty column selection leaves df unchanged
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## confirm that no logical column is left after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) variables present in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## date variables according to the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two lists in both directions to see if any date variable is missing
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-10-25 1967-06-15
## 2 2023-08-11 1946-06-19
## 3 2023-08-14 1952-08-29
## 4 2023-08-07 1941-09-10
## 5 2023-08-18 1931-09-20
## 6 2023-06-19 1960-06-04
## turn the POSIXct columns into plain Date columns
df[datecols] <- Map(as.Date, df[datecols])
## only numeric, character and Date should remain
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 17 vars
## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows for character-type variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 68 vars
## numeric variables according to the DD, then check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n2;"
## [5] "1;\r\n0;" "1;"
## DD rows for numeric variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the dataset's own name
CERAD_IMM_RC <- df
## ---- CERAD_RECOG_RC ----
## bind the dataset to the generic name df; every check below operates on df
## and the result is written back to CERAD_RECOG_RC at the end of the section
df <- CERAD_RECOG_RC
## info() is a project helper (presumably from helperScripts_PRADI.R - confirm);
## judging by the recorded output it reports #observations, #columns and the
## number of distinct individuals identified by SYSIND
info(CERAD_RECOG_RC,"SYSIND")
## #obs:177, cols:48, inds:177
## extract all the unique data types
## (logical and POSIXct columns show up here; they are handled in the steps that follow)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; the large max.level / list.len values
## keep str() from cutting the listing short
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 177 obs. of 48 variables:
## $ SYSXM : num 8275863 8260583 8278763 8264053 8264793 ...
## $ SYSIND : num 11160523 11163453 11618053 11620393 11617573 ...
## $ SYSGP : num 7923793 7924953 8005213 8005493 8004733 ...
## $ SYSGPSTUDY : num 1361903 1363063 1451923 1452203 1451443 ...
## $ SYSINDGP : num 7923633 7926663 8387123 8389463 8386643 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87883 87923 104511 104500 104525 ...
## $ IND : num 1 9000 1 1 1 105 110 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-14" "2023-10-25" ...
## $ EXAMINER : chr "gsv32" "gsv32" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1939-03-20" "1967-06-15" ...
## $ AGE_AT_EXAM : num 84 56 77 70 91 63 65 76 81 69 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ WLRG_PRESENT : num 1 1 2 2 2 1 1 1 1 2 ...
## $ WLRG_K : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_L : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_A : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_M : num 1 1 0 1 1 1 1 1 1 1 ...
## $ WLRG_B : num 0 1 1 1 0 1 1 1 1 1 ...
## $ WLRG_C : num 0 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_N : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_D : num 0 1 1 1 0 1 0 0 0 1 ...
## $ WLRG_O : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_P : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_E : num 1 1 1 1 1 1 1 0 1 1 ...
## $ WLRG_F : num 0 0 1 1 1 1 1 1 1 1 ...
## $ WLRG_Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_G : num 0 1 0 1 0 1 0 1 1 1 ...
## $ WLRG_R : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_S : num 0 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_H : num 0 1 1 1 0 1 1 1 0 1 ...
## $ WLRG_T : num 1 1 1 1 1 1 1 1 1 1 ...
## $ WLRG_I : num 0 1 1 1 1 1 0 1 1 1 ...
## $ WLRG_J : num 0 1 1 1 1 1 0 1 1 1 ...
## $ COMMENTS_WLRG : chr NA NA NA NA ...
## $ WLRG_YES : num 2 9 9 10 6 10 6 8 8 10 ...
## $ WLRG_YES_STATUS: logi NA NA NA NA NA NA ...
## $ WLRG_NO : num 9 10 9 10 10 10 10 10 10 10 ...
## $ WLRG_NO_STATUS : logi NA NA NA NA NA NA ...
## read the revised data dictionary (DD) sheet that describes CERAD_RECOG_RC
dfDD <- read_excel(revisedDDpath, sheet = "CERAD_RECOG_RC")
## extract all logical variables (their leading values are all NA in the str() output)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 5 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 WLRG_YES_STATUS CHAR
## 5 WLRG_NO_STATUS CHAR
## select the vars to be converted to numeric.
## This must be recomputed for every dataset: the setdiff() below uses
## convert2num, and without this line it would silently reuse the vector left
## behind by a previously processed dataset (or fail if none exists yet).
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## character(0)
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER" "WLRG_YES_STATUS" "WLRG_NO_STATUS"
## convert; assigning to an empty column selection leaves df unchanged
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## confirm that no logical column is left after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) variables present in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## date variables according to the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two lists in both directions to see if any date variable is missing
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-14 1939-03-20
## 2 2023-10-25 1967-06-15
## 3 2023-08-11 1946-06-19
## 4 2023-08-14 1952-08-29
## 5 2023-09-18 1931-09-20
## 6 2023-06-19 1960-06-04
## turn the POSIXct columns into plain Date columns
df[datecols] <- Map(as.Date, df[datecols])
## only numeric, character and Date should remain
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character variables present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 11 vars
## check data type inconsistency:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows for character-type variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
## compare the unique values of those chr columns in the dataset with the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric variables present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 34 vars
## numeric variables according to the DD, then check data type inconsistency:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n2;"
## [5] "1;\r\n0;" "0;\r\n1;"
## DD rows for numeric variables that come with a value specification
tmp <- dfDD[
  !is.na(dfDD$`Valid Responses`) &
    grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned copy back under the dataset's own name
CERAD_RECOG_RC <- df
## ---- CONSENSUS_DX ----
## bind the dataset to the generic name df; the checks below operate on df
df <- CONSENSUS_DX
## info() is a project helper (presumably from helperScripts_PRADI.R - confirm);
## judging by the recorded output it reports #observations, #columns and the
## number of distinct individuals identified by SYSIND
## (here there are more observations than individuals)
info(CONSENSUS_DX,"SYSIND")
## #obs:1807, cols:43, inds:1584
## extract all the unique data types
## (logical and POSIXct columns show up here; they are handled in the steps that follow)
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column; the large max.level / list.len values
## keep str() from cutting the listing short
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 1807 obs. of 43 variables:
## $ SYSXM : num 7583263 7583273 7583283 7583293 7583303 ...
## $ SYSIND : num 11039963 11063713 11063723 11063703 11064573 ...
## $ SYSGP : num 7896303 7896303 7896303 7896303 7896953 ...
## $ SYSGPSTUDY : num 1311623 1311623 1311623 1311623 1312273 ...
## $ SYSINDGP : num 7795703 7822643 7822653 7822633 7823493 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87663 87663 87663 87663 87682 ...
## $ IND : num 101 115 116 113 1008 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ REVIEW_DATE : POSIXct, format: "2018-07-11" "2018-07-11" ...
## $ REVIEWER : chr "v.rodriguez4" "v.rodriguez4" "v.rodriguez4" "v.rodriguez4" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1943-10-18" "1939-08-25" ...
## $ RANK : num 1 1 1 1 1 1 1 1 1 1 ...
## $ CDX : chr "Alzheimers Disease" "Alzheimers Disease" "Alzheimers Disease" "Alzheimers Disease" ...
## $ SUB_DX : chr NA NA NA NA ...
## $ IMPRESSION : chr "POSSIBLE" "POSSIBLE" "POSSIBLE" "POSSIBLE" ...
## $ WHO_DX : chr "MC,KC,VR" "MC,KC,VR" "MC,KC,VR" "MC,KC,VR" ...
## $ DATE_DX : POSIXct, format: "2018-07-11" "2018-07-11" ...
## $ COMMENTS : chr NA NA NA NA ...
## $ CLINICAL_COMMENTS: logi NA NA NA NA NA NA ...
## $ OTHER_TXT1 : logi NA NA NA NA NA NA ...
## $ OTHER_TXT2 : logi NA NA NA NA NA NA ...
## $ OTHER_TXT3 : logi NA NA NA NA NA NA ...
## $ CALC_VAL1 : logi NA NA NA NA NA NA ...
## $ CALC_VAL2 : logi NA NA NA NA NA NA ...
## $ CALC_VAL3 : logi NA NA NA NA NA NA ...
## $ CALC_VAL4 : logi NA NA NA NA NA NA ...
## $ CALC_VAL5 : logi NA NA NA NA NA NA ...
## $ CALC_VAL6 : logi NA NA NA NA NA NA ...
## $ CALC_VAL7 : logi NA NA NA NA NA NA ...
## $ CALC_VAL8 : logi NA NA NA NA NA NA ...
## $ CALC_VAL9 : logi NA NA NA NA NA NA ...
## $ CALC_VAL10 : logi NA NA NA NA NA NA ...
## $ CALC_VAL11 : logi NA NA NA NA NA NA ...
## $ LAST_SOURCE : chr "CHIMERA_USER" "CHIMERA_USER" "CHIMERA_USER" "CHIMERA_USER" ...
## $ OTHER_DATE1 : logi NA NA NA NA NA NA ...
## read the revised data dictionary (DD) sheet that describes CONSENSUS_DX
dfDD <- read_excel(revisedDDpath, sheet = "CONSENSUS_DX")
## extract all logical variables (their leading values are all NA in the str() output)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 17 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 CLINICAL_COMMENTS CHAR
## 3 OTHER_TXT1 CHAR
## 4 OTHER_TXT2 CHAR
## 5 OTHER_TXT3 CHAR
## 6 CALC_VAL1 NUMBER
## 7 CALC_VAL2 NUMBER
## 8 CALC_VAL3 NUMBER
## 9 CALC_VAL4 NUMBER
## 10 CALC_VAL5 NUMBER
## 11 CALC_VAL6 NUMBER
## 12 CALC_VAL7 NUMBER
## 13 CALC_VAL8 NUMBER
## 14 CALC_VAL9 NUMBER
## 15 CALC_VAL10 NUMBER
## 16 CALC_VAL11 NUMBER
## 17 OTHER_DATE1 DATE
## select the vars to be converted to numeric.
## The match is case-insensitive because the DD does not use consistent case
## across sheets, and to agree with the case-insensitive "number" match used
## for the numeric checks further down.
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## 11 vars
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## OTHER_DATE1
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date))
## "REFCTR" "CLINICAL_COMMENTS" "OTHER_TXT1" "OTHER_TXT2" "OTHER_TXT3"
## convert; assigning to an empty column selection leaves df unchanged
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## confirm that no logical column is left after the conversion
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time (POSIXct) variables present in the sub-dataset
datecols <- names(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "REVIEW_DATE" "DATE_OF_BIRTH" "DATE_DX"
## date variables according to the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE", "date")]
## compare the two lists in both directions to see if any date variable is missing
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "OTHER_DATE1" can ignore OTHER_DATE1, as it has been corrected in previous step
## [1] "OTHER_DATE1"
head(df[, datecols])
## REVIEW_DATE DATE_OF_BIRTH DATE_DX
## 1 2018-07-11 1943-10-18 2018-07-11
## 2 2018-07-11 1939-08-25 2018-07-11
## 3 2018-07-11 1934-06-13 2018-07-11
## 4 2018-07-11 1924-10-24 2018-07-11
## 5 2018-07-11 1920-11-01 2018-07-11
## 6 2018-07-11 1956-06-07 2018-07-11
## turn the POSIXct columns into plain Date columns
df[datecols] <- Map(as.Date, df[datecols])
## only numeric, character and Date should remain
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 17 vars
## cross-check character typing between the data and the DD:
##   mismatchChrs_1: character in the data, some other type in the DD
##   mismatchChrs_2: character in the DD, some other type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the values of those columns in the dataset match the DD
## (check_valid_responses() is a project helper not defined in this part of the file)
DT::datatable(check_valid_responses(tmp, df))
## Ignore REVIEWER. For the others, waiting for confirmation from Mike:
## should those invalid values be added to the DD?
## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 22 vars
## numeric variables according to the DD, then cross-check the typing:
##   mismatchNums_1: numeric in the data, some other type in the DD
##   mismatchNums_2: numeric in the DD, some other type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check_valid_numeric_responses() is a project helper not defined in this part of the file
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the dataset name
CONSENSUS_DX <- df
## use CRAFT_21_DEL_RC as the working data frame
df <- CRAFT_21_DEL_RC
## info() is a project helper not defined in this part of the file; output kept below
info(CRAFT_21_DEL_RC, "SYSIND")
## #obs:523, cols:95, inds:519
## list the distinct column classes
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure dump; the large limits keep str() from truncating the column list
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 523 obs. of 95 variables:
## $ SYSXM : num 8275923 8276563 8258913 8259013 8260163 ...
## $ SYSIND : num 11620763 11369703 11369813 11037673 11620563 ...
## $ SYSGP : num 8005723 7951913 7952013 7894423 8005633 ...
## $ SYSGPSTUDY : num 1452433 1397023 1397123 1309743 1452343 ...
## $ SYSINDGP : num 8389833 8138973 8139083 7793413 8389633 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCONTROL" "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 104457 88299 88301 87650 104477 ...
## $ IND : num 1 1 1 9000 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-04-17" "2024-02-13" ...
## $ EXAMINER : chr "sjt82" "gsv32" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1946-12-19" "1944-09-22" ...
## $ AGE_AT_EXAM : num 76 79 76 68 73 73 81 86 86 81 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ CRAFTDVR_ENTRY : logi NA NA NA NA NA NA ...
## $ CRAFTDTI : POSIXct, format: "2023-04-17 12:58:00" "2024-02-13 10:56:00" ...
## $ CRAFTDVR1 : num 0 0 0 1 1 1 0 1 0 0 ...
## $ CRAFTDVR2 : num 0 1 1 1 1 1 1 1 1 1 ...
## $ CRAFTDVR3 : num 0 1 0 0 0 1 0 0 0 0 ...
## $ CRAFTDVR4 : num 1 1 1 1 1 0 0 1 1 0 ...
## $ CRAFTDVR5 : num 0 0 0 1 1 1 0 1 1 1 ...
## $ CRAFTDVR6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR8 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR9 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDVR10 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDVR11 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR12 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR13 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR15 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR16 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDVR17 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDVR18 : num 0 0 0 0 0 1 0 0 1 0 ...
## $ CRAFTDVR19 : num 0 0 0 0 0 1 0 0 1 1 ...
## $ CRAFTDVR20 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR21 : num 1 0 0 0 1 1 1 0 0 0 ...
## $ CRAFTDVR22 : num 1 0 1 0 1 1 0 1 1 0 ...
## $ CRAFTDVR23 : num 1 0 0 0 0 0 0 1 0 0 ...
## $ CRAFTDVR24 : num 1 0 0 0 0 1 0 1 0 0 ...
## $ CRAFTDVR25 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ CRAFTDVR26 : num 1 0 0 0 1 1 0 0 0 0 ...
## $ CRAFTDVR27 : num 0 0 0 0 1 1 0 0 0 0 ...
## $ CRAFTDVR28 : num 0 1 0 0 0 1 1 1 1 1 ...
## $ CRAFTDVR29 : num 0 0 0 0 1 1 0 0 0 0 ...
## $ CRAFTDVR30 : num 0 0 0 0 1 1 0 0 1 0 ...
## $ CRAFTDVR31 : num 0 0 0 0 1 1 0 0 1 0 ...
## $ CRAFTDVR32 : num 0 1 0 0 1 1 0 0 1 1 ...
## $ CRAFTDVR33 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ CRAFTDVR34 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR35 : num 0 0 0 0 1 0 0 1 0 0 ...
## $ CRAFTDVR36 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDVR37 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ CRAFTDVR38 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ CRAFTDVR39 : num 0 0 0 1 1 1 0 0 1 0 ...
## $ CRAFTDVR40 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDVR41 : num 0 0 0 1 1 1 0 0 1 1 ...
## $ CRAFTDVR42 : num 0 0 0 1 0 1 0 0 0 0 ...
## $ CRAFTDVR43 : num 0 0 0 1 1 1 0 0 1 1 ...
## $ CRAFTDVR44 : num 0 0 0 1 1 1 0 1 1 1 ...
## $ CRAFTDRE1 : num 0 0 0 1 1 1 0 1 1 0 ...
## $ CRAFTDRE2 : num 0 1 1 1 1 1 1 1 1 1 ...
## $ CRAFTDRE3 : num 0 1 0 0 0 1 0 0 0 1 ...
## $ CRAFTDRE4 : num 1 1 1 1 1 0 0 1 1 1 ...
## $ CRAFTDRE5 : num 0 0 0 1 1 1 0 1 1 1 ...
## $ CRAFTDRE6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDRE7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDRE8 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTDRE9 : num 1 1 0 0 0 0 0 0 0 0 ...
## $ CRAFTDRE10 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDRE11 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ CRAFTDRE12 : num 0 0 0 0 0 1 0 0 1 1 ...
## $ CRAFTDRE13 : num 1 0 0 0 1 1 0 0 1 1 ...
## $ CRAFTDRE14 : num 1 0 1 1 1 1 1 1 1 0 ...
## $ CRAFTDRE15 : num 1 0 0 0 0 1 0 1 0 0 ...
## $ CRAFTDRE16 : num 1 0 0 1 1 1 0 0 0 0 ...
## $ CRAFTDRE17 : num 1 1 0 1 0 1 1 1 1 1 ...
## $ CRAFTDRE18 : num 0 0 0 0 1 1 0 0 1 0 ...
## $ CRAFTDRE19 : num 0 0 0 1 1 1 0 0 1 0 ...
## $ CRAFTDRE20 : num 0 1 0 0 1 1 0 0 1 0 ...
## $ CRAFTDRE21 : num 0 0 0 0 1 1 1 1 1 1 ...
## $ CRAFTDRE22 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ CRAFTDRE23 : num 0 0 0 1 1 1 0 0 1 0 ...
## $ CRAFTDRE24 : num 0 0 0 1 1 1 0 0 1 1 ...
## $ CRAFTDRE25 : num 0 0 0 1 1 1 1 0 1 1 ...
## $ CRAFTCUE : num 0 1 0 0 1 1 1 1 1 0 ...
## $ COMMENTS_CRAFTDRE: chr NA NA NA NA ...
## load the revised data-dictionary (DD) sheet for CRAFT_21_DEL_RC
dfDD <- read_excel(path = revisedDDpath, sheet = "CRAFT_21_DEL_RC")
## names of the columns whose class in df is logical
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## show the type the regenerated DD declares for each of those columns
dfDD[is.element(dfDD$VarNames, logicols), c("VarNames", "Data Type")]
## # A tibble: 4 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 CRAFTDVR_ENTRY VARCHAR2(500)
## Logical columns take their target type from the DD.
## select the vars to be converted to numeric
## convert2num must be rebuilt from THIS sheet: otherwise it still holds the
## names computed for the previous dataset, and a logical column typed NUMBER
## here would fall through to the character branch.
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]
## none for this dataset (see the DD listing above)
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER" "CRAFTDVR_ENTRY"
## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "CRAFTDTI"
## extract date variables from regenerated DD
## (case-insensitive match, the same rule used to build convert2date, so the
## two date classifications cannot disagree on e.g. "Date")
datecolsFromDD <- dfDD$VarNames[grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is found
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH CRAFTDTI
## 1 2023-04-17 1946-12-19 2023-04-17 12:58:00
## 2 2024-02-13 1944-09-22 2024-02-13 10:56:00
## 3 2024-02-13 1947-05-13 2024-02-13 10:59:00
## 4 2023-10-24 1954-10-29 2023-10-24 14:33:00
## 5 2023-05-15 1949-12-01 2023-05-15 10:23:00
## 6 2023-05-15 1950-04-02 2023-05-15 12:07:00
## convert format
## CRAFTDTI carries a time of day (see the head() output above); as.Date()
## would silently drop it. Keep it as POSIXct, the same way CRAFTVRS_TIME is
## kept for CRAFT_21_IMM_RC, and convert only the date-only columns.
datecols <- setdiff(datecols, "CRAFTDTI")
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## expected classes: "numeric", "character", "Date", and "POSIXct" "POSIXt" (CRAFTDTI)
## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 10 vars
## cross-check character typing between the data and the DD:
##   mismatchChrs_1: character in the data, some other type in the DD
##   mismatchChrs_2: character in the DD, some other type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the values of those columns in the dataset match the DD
## (check_valid_responses() is a project helper not defined in this part of the file)
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 81 vars
## numeric variables according to the DD, then cross-check the typing:
##   mismatchNums_1: numeric in the data, some other type in the DD
##   mismatchNums_2: numeric in the DD, some other type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "0;\r\n1;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check_valid_numeric_responses() is a project helper not defined in this part of the file
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the dataset name
CRAFT_21_DEL_RC <- df
## use CRAFT_21_IMM_RC as the working data frame
df <- CRAFT_21_IMM_RC
## info() is a project helper not defined in this part of the file; output kept below
info(CRAFT_21_IMM_RC, "SYSIND")
## #obs:530, cols:98, inds:525
## list the distinct column classes
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure dump; the large limits keep str() from truncating the column list
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 530 obs. of 98 variables:
## $ SYSXM : num 8258833 8258863 8260063 8277603 8277783 ...
## $ SYSIND : num 11369813 11037673 11620563 11435853 11638763 ...
## $ SYSGP : num 7952013 7894423 8005633 7962813 8007323 ...
## $ SYSGPSTUDY : num 1397123 1309743 1452343 1407923 1454033 ...
## $ SYSINDGP : num 8139083 7793413 8389633 8205123 8407833 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 88301 87650 104477 88452 104540 ...
## $ IND : num 1 9000 1 1 1 1 105 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-13" "2023-10-24" ...
## $ EXAMINER : chr "jjs2031" "gsv32" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1947-05-13" "1954-10-29" ...
## $ AGE_AT_EXAM : num 76 68 73 81 86 86 71 73 81 79 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ CRAFTVRS_ENTRY : logi NA NA NA NA NA NA ...
## $ CRAFTVRS_TIME : POSIXct, format: "2024-02-13 10:41:00" "2023-10-24 14:18:00" ...
## $ CRAFTVRS1 : num 1 1 1 1 1 1 0 1 0 0 ...
## $ CRAFTVRS2 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ CRAFTVRS3 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ CRAFTVRS4 : num 1 1 1 1 1 0 0 0 1 1 ...
## $ CRAFTVRS6 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS8 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS5 : num 0 1 1 0 1 1 0 0 1 1 ...
## $ CRAFTVRS9 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS10 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS11 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS12 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS13 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ CRAFTVRS14 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ CRAFTVRS15 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ CRAFTVRS16 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS17 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTVRS18 : num 0 0 1 0 0 0 0 0 1 1 ...
## $ CRAFTVRS19 : num 0 0 1 0 0 0 0 0 1 1 ...
## $ CRAFTVRS20 : num 0 0 1 0 0 1 0 0 0 1 ...
## $ CRAFTVRS21 : num 0 1 1 0 0 0 0 1 1 1 ...
## $ CRAFTVRS22 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ CRAFTVRS23 : num 0 0 1 0 0 0 0 1 0 0 ...
## $ CRAFTVRS24 : num 0 0 1 0 0 0 0 1 1 0 ...
## $ CRAFTVRS25 : num 0 1 1 0 0 0 0 1 0 1 ...
## $ CRAFTVRS26 : num 0 1 1 0 1 0 0 1 0 1 ...
## $ CRAFTVRS27 : num 0 1 1 0 1 0 0 1 0 1 ...
## $ CRAFTVRS28 : num 1 1 1 1 1 0 1 0 1 0 ...
## $ CRAFTVRS29 : num 0 1 0 0 0 1 0 0 0 0 ...
## $ CRAFTVRS30 : num 0 1 1 0 0 1 0 0 0 0 ...
## $ CRAFTVRS31 : num 0 0 0 0 0 1 0 0 1 0 ...
## $ CRAFTVRS32 : num 0 1 1 0 1 1 0 1 1 1 ...
## $ CRAFTVRS33 : num 0 1 1 0 1 0 0 1 0 1 ...
## $ CRAFTVRS34 : num 0 1 1 0 1 0 0 0 0 1 ...
## $ CRAFTVRS35 : num 0 0 1 0 1 1 0 0 0 1 ...
## $ CRAFTVRS36 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ CRAFTVRS37 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ CRAFTVRS38 : num 0 0 1 0 0 1 0 0 0 0 ...
## $ CRAFTVRS39 : num 0 1 0 0 0 1 0 0 0 1 ...
## $ CRAFTVRS40 : num 0 1 0 0 0 0 0 0 0 1 ...
## $ CRAFTVRS41 : num 0 1 1 0 1 1 0 0 1 1 ...
## $ CRAFTVRS42 : num 0 1 1 0 1 0 0 0 0 1 ...
## $ CRAFTVRS43 : num 0 1 1 0 1 1 0 0 1 0 ...
## $ CRAFTVRS44 : num 0 1 1 1 1 0 1 0 1 0 ...
## $ CRAFTURS1 : num 1 1 1 1 1 1 0 1 0 0 ...
## $ CRAFTURS2 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ CRAFTURS3 : num 0 1 0 0 0 0 0 0 0 1 ...
## $ CRAFTURS4 : num 1 1 1 1 1 1 0 0 1 1 ...
## $ CRAFTURS5 : num 0 1 1 0 1 1 0 0 1 1 ...
## $ CRAFTURS6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTURS7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTURS8 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CRAFTURS9 : num 1 0 0 0 1 0 0 0 0 0 ...
## $ CRAFTURS10 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ CRAFTURS11 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ CRAFTURS12 : num 0 1 1 0 0 0 0 0 1 1 ...
## $ CRAFTURS13 : num 0 1 1 0 0 1 0 1 1 1 ...
## $ CRAFTURS14 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ CRAFTURS15 : num 0 1 1 0 0 0 0 1 1 0 ...
## $ CRAFTURS16 : num 0 1 1 0 1 0 0 1 0 1 ...
## $ CRAFTURS17 : num 1 1 1 1 1 1 1 0 1 0 ...
## $ CRAFTURS18 : num 0 1 0 0 0 1 0 0 0 0 ...
## $ CRAFTURS19 : num 0 1 1 0 0 1 0 1 0 1 ...
## $ CRAFTURS20 : num 0 0 1 0 1 1 1 1 1 1 ...
## $ CRAFTURS21 : num 0 0 1 1 1 1 0 0 1 1 ...
## $ CRAFTURS22 : num 0 0 1 0 0 1 0 0 0 0 ...
## $ CRAFTURS24 : num 0 1 1 0 1 0 0 0 1 1 ...
## $ CRAFTURS23 : num 0 1 0 0 1 1 0 0 0 1 ...
## $ CRAFTURS25 : num 0 1 1 1 1 1 0 0 1 0 ...
## $ COMMENTS_CRAFTVRS : chr NA NA NA NA ...
## $ SCORE_CRAFTVRS : num 5 22 27 6 19 14 2 11 14 20 ...
## $ SCORE_CRAFTVRS_STATUS: chr NA NA NA NA ...
## $ SCORE_CRAFTURS : num 6 18 16 7 13 14 3 8 12 13 ...
## $ SCORE_CRAFTURS_STATUS: chr NA NA NA NA ...
## load the revised data-dictionary (DD) sheet for CRAFT_21_IMM_RC
dfDD <- read_excel(path = revisedDDpath, sheet = "CRAFT_21_IMM_RC")
## names of the columns whose class in df is logical
logicols <- names(df)[vapply(df, is.logical, logical(1))]
## show the type the regenerated DD declares for each of those columns
dfDD[is.element(dfDD$VarNames, logicols), c("VarNames", "Data Type")]
## # A tibble: 4 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 CRAFTVRS_ENTRY VARCHAR2(500)
## Logical columns take their target type from the DD.
## select the vars to be converted to numeric
## convert2num must be rebuilt from THIS sheet: otherwise it still holds the
## names computed for an earlier dataset, and a logical column typed NUMBER
## here would fall through to the character branch.
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("number", dfDD$`Data Type`, ignore.case = TRUE)]
## none for this dataset (see the DD listing above)
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## "REFCTR" "REVIEWER" "CRAFTVRS_ENTRY"
## convert (each assignment is a no-op when its name vector is empty)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date variables from sub-dataset (columns stored as POSIXct/POSIXt)
datecols <- colnames(df)[vapply(df, inherits, logical(1), what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "CRAFTVRS_TIME"
## extract date variables from regenerated DD
## (case-insensitive match, the same rule used to build convert2date, so the
## two date classifications cannot disagree on e.g. "Date")
datecolsFromDD <- dfDD$VarNames[grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## drop = FALSE keeps a data.frame even if only one date column is found
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH CRAFTVRS_TIME
## 1 2024-02-13 1947-05-13 2024-02-13 10:41:00
## 2 2023-10-24 1954-10-29 2023-10-24 14:18:00
## 3 2023-05-15 1949-12-01 2023-05-15 10:12:00
## 4 2024-02-15 1942-09-30 2024-02-15 14:22:00
## 5 2023-09-13 1937-08-13 2023-09-13 10:02:00
## 6 2023-05-09 1936-05-22 2023-05-09 11:46:00
## convert format
## CRAFTVRS_TIME stays POSIXct because it contains a timestamp that as.Date()
## would drop; only the other two columns are converted to Date
datecols <- setdiff(datecols, "CRAFTVRS_TIME")
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "Date"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## character columns present in the sub-dataset
chrcols <- names(df)[vapply(df, is.character, logical(1))] ## 12 vars
## cross-check character typing between the data and the DD:
##   mismatchChrs_1: character in the data, some other type in the DD
##   mismatchChrs_2: character in the DD, some other type in the data
chrColsfromDD <- dfDD[
  grepl("^(varchar|char)", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Data Type")
]
mismatchChrs_1 <- setdiff(x = chrcols, y = chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(x = chrColsfromDD[["VarNames"]], y = chrcols) ## character(0)
## character variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("CHAR|VARCHAR", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check whether the values of those columns in the dataset match the DD
## (check_valid_responses() is a project helper not defined in this part of the file)
DT::datatable(check_valid_responses(tmp, df))
## ignore EXAMINER
## numeric columns present in the sub-dataset
numcols <- names(df)[vapply(df, is.numeric, logical(1))] ## 82 vars
## numeric variables according to the DD, then cross-check the typing:
##   mismatchNums_1: numeric in the data, some other type in the DD
##   mismatchNums_2: numeric in the DD, some other type in the data
numColsfromDD <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE),
  c("VarNames", "Valid Responses")
]
mismatchNums_1 <- setdiff(x = numcols, y = numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(x = numColsfromDD[["VarNames"]], y = numcols) ## character(0)
## distinct valid-response specifications among the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "0;\r\n1;"
## numeric variables for which the DD lists valid responses
tmp <- dfDD[
  grepl("number", dfDD$`Data Type`, ignore.case = TRUE) &
    !is.na(dfDD$`Valid Responses`),
  c("VarNames", "Valid Responses")
]
## check_valid_numeric_responses() is a project helper not defined in this part of the file
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the cleaned working copy back under the dataset name
CRAFT_21_IMM_RC <- df
## use MEDCON_RC as the working data frame
df <- MEDCON_RC
## info() is a project helper not defined in this part of the file; output kept below
info(MEDCON_RC, "SYSIND")
## #obs:627, cols:237, inds:618
## list the distinct column classes
unique(sapply(X = df, FUN = class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## full structure dump; the large limits keep str() from truncating the column list
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 627 obs. of 237 variables:
## $ SYSXM : num 8258763 8258803 8260083 8277583 8277993 ...
## $ SYSIND : num 11037673 11369813 11362953 11435853 11621333 ...
## $ SYSGP : num 7894423 7952013 7946353 7962813 8006293 ...
## $ SYSGPSTUDY : num 1309743 1397123 1387463 1407923 1453003 ...
## $ SYSINDGP : num 7793413 8139083 8132223 8205123 8390403 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCRLPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87650 88301 87545 88452 104528 ...
## $ IND : num 9000 1 106 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-10-24" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "gsv32" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1954-10-29" "1947-05-13" ...
## $ AGE_AT_EXAM : num 68 76 66 81 86 86 60 81 79 67 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ MEMORY_COMPLAINTS : num 0 0 1 0 0 1 0 0 0 1 ...
## $ DATE_OF_ONSET : POSIXct, format: NA NA ...
## $ DOA_UNK : chr NA NA NA NA ...
## $ DESCRIBE : chr "hysterectomy (1987), no HET, knee surgery 2010, carpal tunnel surgery 2011" NA NA "Hypertension x 20yrs; left knee prothesis due to osteoarthrosis, right knee in need of surgery; hypercholest.; "| __truncated__ ...
## $ MEM_COMPLAINTS : chr "68 y/o mixed female born in PR. Oriented in time, space and person. No memory complaints, however complaints of"| __truncated__ "NO MEMORY COMPLAINTS. PERSON ORIENTED IN TIME, SPACE, AND PERSON. HE LIVES ALONE, HE DOESN'T NEED HELP TO CHANG"| __truncated__ "YES MEMORY COMPLAINTS. ORIENTED EN TIME, SPACE, AND PERSONA. SHE SAYS THAT HER MEMORY WAS FULL AND WELL DURING "| __truncated__ "Refers no major changes in memory. Remembers phone numbers and addresses well. He is 81y/o, with 9yrs of educ"| __truncated__ ...
## $ CURRENT_MED : chr "high blood pressure, sleep apnea, diabetes (10 years ago) arthritis (13 years ago)" "DM 10 Y/0 HIGH BLOOD PRESSURE 10 Y/0" "HYPOTHYROIDISM 30 Y/O CHOLESTEROL 10 Y/O DM 5 YEARS AGO BREAST CANCER 2013 DEPRESSION 2013 ASTHMA 4 Y/O ARTHIRITIS 2017." "see above" ...
## $ PMH : chr NA NA NA "see above" ...
## $ MOOD_CHANGES : chr "None reported" "NO DEPRESSION OR ANXIETY" "YES DEPRESSION AND ANXEITY" "H/o depression and anxiety x 20yrs, with meds, was with psychiatrist but not anymore" ...
## $ HYPERTENSION_DX : num 1 1 0 1 1 1 0 0 1 1 ...
## $ HYPERTENSION_TREATED : num 1 1 -1 1 1 1 0 -1 1 1 ...
## $ DIABETES_DX : num 1 1 1 1 1 0 0 0 1 1 ...
## $ DIABETES_TREATED : num 1 1 1 1 1 0 0 -1 1 1 ...
## $ MYOCARDIAL_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MYOCARDIAL_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ HEART_FAILURE_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HEART_FAILURE_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ HEART_DISEASE_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HEART_DISEASE_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ COPD_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ COPD_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ THYROID_DX : num 0 0 1 0 0 0 0 0 0 0 ...
## $ THYROID_TREATED : num 0 -1 1 NA -1 0 0 -1 NA -1 ...
## $ LIVER_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LIVER_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ RENAL_DX : num 0 0 0 0 1 0 0 0 1 0 ...
## $ RENAL_TREATED : num 0 -1 -1 NA 1 0 0 -1 NA -1 ...
## $ PEPTIC_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PEPTIC_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ PERIPHERAL_DX : num 1 0 0 1 0 0 0 0 0 1 ...
## $ PERIPHERAL_TREATED : num 0 -1 -1 1 -1 0 0 -1 NA 0 ...
## $ STROKE_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ STROKE_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ TIA_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TIA_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ HEAD_INJURY_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HEAD_INJURY_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ SEIZURE_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SEIZURE_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ CANCER_DX : num 0 0 1 0 1 0 0 0 0 0 ...
## $ CANCER_TREATED : num 0 -1 0 NA 1 0 0 -1 NA -1 ...
## $ ARTHRITIS_DX : num 1 0 1 1 1 1 0 1 1 1 ...
## $ ARTHRITIS_TREATED : num 1 -1 1 1 1 1 0 1 1 0 ...
## $ SYPHILIS_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SYPHILIS_TREATED : num 0 -1 -1 0 -1 0 0 -1 NA -1 ...
## $ ALCOHOL_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ALCOHOL_TREATED : num 0 -1 -1 0 -1 0 0 -1 NA -1 ...
## $ ILLICIT_DRUG_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ILLICIT_DRUG_TREATED : num 0 -1 -1 0 -1 0 0 -1 NA -1 ...
## $ SMOKING_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SMOKING_TREATED : num 0 -1 -1 0 -1 0 0 -1 NA -1 ...
## $ PD_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PD_TREATED : num 0 -1 -1 0 -1 0 0 -1 NA -1 ...
## $ HUNTINGTON_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HUNTINGTON_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ MULTIPLE_SCLEROSIS_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MULTIPLE_SCLEROSIS_TREATED: num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ B12_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ B12_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ HYDROCEPHALUS_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HYDROCEPHALUS_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ TREMOR_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TREMOR_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ DOWN_SYNDROME_DX : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DOWN_SYNDROME_TREATED : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## $ MED_CONDITIONS_DX : num 0 0 0 1 0 0 0 0 0 0 ...
## $ MED_CONDITIONS_TREATED : num 0 -1 -1 1 -1 0 0 -1 NA -1 ...
## $ OTH_MED_COND_SP : chr NA NA NA "depression and anxiety" ...
## $ STROKE_BRAIN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DOCTOR : num NA 9 9 NA 9 0 NA 9 0 9 ...
## $ STROKE_PAST : num NA 9 9 NA 9 0 NA 9 0 9 ...
## $ STROKE_24HRS : num NA 9 9 NA 9 0 NA 9 0 9 ...
## $ SYMPTOMS : num NA 9 9 NA 9 0 NA 9 0 9 ...
## $ LOST_SPEECH : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LOST_UNDERSTAND : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LOSS_CONSCIOUS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ WEAKNESS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NUMBNESS : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LOSS_VISION : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HALF_VISION : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PERIOD : num 1 0 1 1 1 1 0 1 1 0 ...
## $ AGE_C24A : num 67 NA 55 60 59 86 NA 77 70 NA ...
## $ DONT_KNOW : chr NA NA NA NA ...
## $ SEEK_HELP : num 1 NA 1 1 0 1 NA 1 1 NA ...
## $ TREATMENT : num 0 NA 0 0 0 0 NA 0 0 NA ...
## $ MEDS : num 1 NA 1 1 0 0 NA 1 1 NA ...
## $ PSYCHOTHERAPY : num 1 NA 1 1 0 1 NA 1 1 NA ...
## $ OTHER : num 0 NA 0 0 0 0 NA 0 0 NA ...
## $ UNKNOWN : num 0 NA 0 0 0 0 NA 0 0 NA ...
## $ SPECIFY_OTHER : chr NA NA NA NA ...
## $ TAKING_MEDS : num 1 0 1 1 1 1 1 1 1 1 ...
## $ MEDICATION1 : chr "aspirin" NA "LEVOTHYROXINE" "doesn't remember medications" ...
## $ STRENGTH1 : chr "81 mg" NA "112 MG DAILY" NA ...
## $ SEEN1 : num 0 NA 0 0 0 NA 0 0 NA 0 ...
## $ MEDICATION2 : chr "lipidol" NA "MEMANTINE HCL" NA ...
## $ STRENGTH2 : chr "20 mg" NA "10 MG TWICE DAILY" NA ...
## $ SEEN2 : chr "0" NA "0" NA ...
## $ MEDICATION3 : chr "zyrtec" NA "XISDUO XR(METFORMIN HCL)" NA ...
## $ STRENGTH3 : chr "10 mg" NA "5MG/1000MG" NA ...
## $ SEEN3 : num 0 NA 0 NA 0 NA NA 0 NA 0 ...
## $ MEDICATION4 : chr "vitamin D3" NA "ESCITALOPRAM" NA ...
## $ STRENGTH4 : chr "50,000 d" NA "20 MG 1 DAILY" NA ...
## $ SEEN4 : chr "0" NA "0" NA ...
## $ MEDICATION5 : chr "folic acid" NA "ATORVASTATIN CALCIUM" NA ...
## $ STRENGTH5 : chr "1 mg" NA "20 MG" NA ...
## $ SEEN5 : num 0 NA 0 NA 0 NA NA 0 NA NA ...
## $ MEDICATION6 : chr "daflonex" NA "FOROTIDINE" NA ...
## $ STRENGTH6 : chr "XL as indicated" NA "20 MG DAILY" NA ...
## $ SEEN6 : chr "0" NA "0" NA ...
## $ MEDICATION7 : chr "methenamine" NA "LISINIPROL" NA ...
## $ STRENGTH7 : chr "500 mg" NA "10 MG DAILY" NA ...
## $ SEEN7 : num 0 NA 0 NA 0 NA NA 0 NA NA ...
## $ MEDICATION8 : chr "methnotexate" NA "MONTELUKAST SODIUM" NA ...
## $ STRENGTH8 : chr "2.5 mg" NA "10 MG DAILY" NA ...
## $ SEEN8 : chr "0" NA "0" NA ...
## $ MEDICATION9 : chr "lexapro" NA "FOLIC ACID" NA ...
## $ STRENGTH9 : chr "10 mg" NA "1 MG DAILY" NA ...
## $ SEEN9 : num 0 NA 0 NA 0 NA NA NA NA NA ...
## $ MEDICATION10 : chr "frova" NA "VITAMIN D" NA ...
## $ STRENGTH10 : chr "2.5 mg" NA NA NA ...
## $ SEEN10 : chr "0" NA "0" NA ...
## $ MEDICATION11 : chr "mirapen" NA "BIOTIN" NA ...
## $ STRENGTH11 : chr ".5 mg" NA NA NA ...
## $ SEEN11 : num 0 NA 0 NA NA NA NA NA NA NA ...
## $ MEDICATION12 : chr "lyrica" NA NA NA ...
## $ STRENGTH12 : chr "25 mg" NA NA NA ...
## $ SEEN12 : chr NA NA NA NA ...
## $ MEDICATION13 : chr "lisinopril" NA NA NA ...
## $ STRENGTH13 : chr "5 mg" NA NA NA ...
## $ SEEN13 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDICATION14 : chr "pepcid" NA NA NA ...
## $ STRENGTH14 : chr "20 mg" NA NA NA ...
## $ SEEN14 : chr NA NA NA NA ...
## $ MEDICATION15 : chr NA NA NA NA ...
## $ STRENGTH15 : chr NA NA NA NA ...
## $ SEEN15 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDICATION16 : chr NA NA NA NA ...
## $ STRENGTH16 : chr NA NA NA NA ...
## $ SEEN16 : chr NA NA NA NA ...
## $ MEDICATION17 : chr NA NA NA NA ...
## $ STRENGTH17 : chr NA NA NA NA ...
## $ SEEN17 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDICATION18 : chr NA NA NA NA ...
## $ STRENGTH18 : logi NA NA NA NA NA NA ...
## $ SEEN18 : chr NA NA NA NA ...
## $ MEDICATION19 : chr NA NA NA NA ...
## $ STRENGTH19 : chr NA NA NA NA ...
## $ SEEN19 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDICATION20 : logi NA NA NA NA NA NA ...
## $ STRENGTH20 : logi NA NA NA NA NA NA ...
## $ SEEN20 : logi NA NA NA NA NA NA ...
## $ NOTES_MEDLIST : chr NA NA NA NA ...
## $ WARFARIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ ASPIRIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIPLATELETS : num NA NA NA NA NA NA NA NA NA NA ...
## $ DIURETICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTICONVULSANTS : num NA NA NA NA NA NA NA NA NA NA ...
## $ INSULIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYPOGLYCEMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ SULFONYLUREA : num NA NA NA NA NA NA NA NA NA NA ...
## $ METFORMIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ GLITAZONES : num NA NA NA NA NA NA NA NA NA NA ...
## $ DIGITALIS : num NA NA NA NA NA NA NA NA NA NA ...
## $ NITRATES : num NA NA NA NA NA NA NA NA NA NA ...
## $ CALCIUM_CHANNEL : num NA NA NA NA NA NA NA NA NA NA ...
## $ BETA_2_AGAONIST : num NA NA NA NA NA NA NA NA NA NA ...
## $ BETA_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ACE : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTI_ARRHYTHMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTI_HYPERLIPIDEMICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ STATIN_DRUG : num NA NA NA NA NA NA NA NA NA NA ...
## $ FIBRATE_DRUG : num NA NA NA NA NA NA NA NA NA NA ...
## $ THYROID : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTICHOLINERGICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ LEVODOPA : num NA NA NA NA NA NA NA NA NA NA ...
## $ DOPAMINE1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIDEPRESSANTS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANTIPSYCHOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ ANXIOLYTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ CHOLINESTERASE : num NA NA NA NA NA NA NA NA NA NA ...
## $ RIVASTIGMINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ TACRINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ DONEPEZIL : num NA NA NA NA NA NA NA NA NA NA ...
## $ GALANTAMINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ NMDA : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEMANTINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ ALPHA_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYPNOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ H1_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ H2_BLOCKERS : num NA NA NA NA NA NA NA NA NA NA ...
## $ NSAID : num NA NA NA NA NA NA NA NA NA NA ...
## $ COX2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NARCOTICS : num NA NA NA NA NA NA NA NA NA NA ...
## $ HYDERGINE : num NA NA NA NA NA NA NA NA NA NA ...
## $ DEPRENYL : num NA NA NA NA NA NA NA NA NA NA ...
## $ ESTROGEN_SUPP : num NA NA NA NA NA NA NA NA NA NA ...
## $ PRESCRIPTION : num NA NA NA NA NA NA NA NA NA NA ...
## $ OTC : num NA NA NA NA NA NA NA NA NA NA ...
## $ STEROIDS : num NA NA NA NA NA NA NA NA NA NA ...
## $ OTHER_MEDS : num NA NA NA NA NA NA NA NA NA NA ...
## $ C57_SPEC_MEDS : chr NA NA NA NA ...
## $ MULTIVITAMINS : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_C : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_E : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMINE_B12 : num NA NA NA NA NA NA NA NA NA NA ...
## $ COENZYME_Q : num NA NA NA NA NA NA NA NA NA NA ...
## $ DHA : num NA NA NA NA NA NA NA NA NA NA ...
## $ LECITHIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ GINKGO : num NA NA NA NA NA NA NA NA NA NA ...
## $ FOLIC_ACID : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_B6 : num NA NA NA NA NA NA NA NA NA NA ...
## $ VITAMIN_D : num NA NA NA NA NA NA NA NA NA NA ...
## $ OMEGA3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MEDCOND_COMENTS : chr NA NA NA NA ...
## $ MED_CONDITIONS_HIV : num 0 0 0 0 0 0 0 0 NA 0 ...
## $ MED_CONDITIONS_HIV_TX : num 0 -1 -1 NA -1 0 0 -1 NA -1 ...
## read the "MEDCON_RC" sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "MEDCON_RC")
## names of the columns whose class is logical (7 vars)
logicols <- names(df)[sapply(df, is.logical)]
## show the data type the regenerated DD declares for each of those columns
dfDD %>%
  filter(VarNames %in% logicols) %>%
  select(VarNames, `Data Type`)
## # A tibble: 7 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE DATE
## 3 REVIEWER CHAR
## 4 STRENGTH18 VARCHAR2(30)
## 5 MEDICATION20 VARCHAR2(30)
## 6 STRENGTH20 VARCHAR2(30)
## 7 SEEN20 NUMBER(1)
## group the logical columns by the data type the DD declares for them
## (matched case-insensitively; TRUE is spelled out because T is an ordinary,
## reassignable variable)
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)]
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)]
## retype each group in place, each one from its own columns
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2num] <- lapply(df[convert2num], as.numeric)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns found in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "DATE_OF_ONSET"
## columns the regenerated DD declares as dates
datecolsFromDD <- dfDD %>%
  filter(`Data Type` %in% c("DATE", "date")) %>%
  pull(VarNames)
## cross-check the two lists to see whether any date variable is missing
## (a) date-time in the data but not a date in the DD
setdiff(datecols, datecolsFromDD)
## character(0)
## (b) date in the DD but not date-time in the data; REVIEW_DATE can be
##     ignored, it has been corrected in the previous step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
## preview the values before converting them
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH DATE_OF_ONSET
## 1 2023-10-24 1954-10-29 <NA>
## 2 2024-02-13 1947-05-13 <NA>
## 3 2024-02-20 1957-08-05 2021-06-01
## 4 2024-02-15 1942-09-30 <NA>
## 5 2023-05-09 1936-05-22 <NA>
## 6 2023-09-13 1937-08-13 2023-04-01
## convert the date-time columns to class Date
df <- df %>%
  mutate(across(all_of(datecols), as.Date))
## recheck the unique data types
df %>% sapply(class) %>% unique()
## [1] "numeric" "character" "Date"
## character columns found in the sub-dataset (69 vars)
chrcols <- names(df)[sapply(df, is.character)]
## compare with the DD to spot data type inconsistencies:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## 9 vars
## [1] "SEEN2" "SEEN4" "SEEN6" "SEEN8" "SEEN10" "SEEN12" "SEEN14" "SEEN16" "SEEN18"
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## the mismatchChrs_1 columns are not character in the DD: make them numeric
## NOTE(review): as.numeric() turns any non-numeric text into NA (with a
## warning) -- confirm these columns hold no such values
df <- df %>%
  mutate(across(all_of(mismatchChrs_1), as.numeric))
## DD rows of the character variables that come with a value specification
tmp <- dfDD %>%
  filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  select(VarNames, `Valid Responses`)
## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER
## numeric columns found in the sub-dataset (164 vars)
numcols <- names(df)[sapply(df, is.numeric)]
## numeric variables according to the DD; compare both lists to spot
## data type inconsistencies:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;"
## [3] "1 thru 9999;" "0;\r\n1;"
## [5] "0;\r\n1;\r\n9;\r\n-1;" "0;\r\n1;\r\n7;\r\n8;\r\n9;"
## [7] "0;\r\n1;\r\n9;"
## keep the numeric variables that come with a value specification
tmp <- numColsfromDD %>%
  filter(!is.na(`Valid Responses`))
## check the values of those columns in the dataset against the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the checked sub-dataset back under its own name
MEDCON_RC <- df
## next sub-dataset: work on MEDICAL_HIST under the generic name df
df <- MEDICAL_HIST
## size summary from the sourced info() helper, called with "SYSIND"
info(MEDICAL_HIST, "SYSIND")
## #obs:889, cols:53, inds:871
## list the distinct column classes
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so none is cut off)
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 889 obs. of 53 variables:
## $ SYSXM : num 7606563 7493573 7592623 7576033 7596083 ...
## $ SYSIND : num 11163223 11037553 11160533 11158043 11007943 ...
## $ SYSGP : num 7924813 7894373 7896973 7896073 7888893 ...
## $ SYSGPSTUDY : num 1362923 1309693 1312293 1311393 1304233 ...
## $ SYSINDGP : num 7926433 7793293 7923643 7921153 7762743 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADFAMPRADI" "ADFAMPRADI" "ADFAMPRADI" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87927 87502 87684 87511 87564 ...
## $ IND : num 1 1 103 9006 100 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2018-10-17" "2018-01-08" ...
## $ EXAMINER : chr "v.rodriguez4" "axr1589" "axr1589" "axr1589" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1933-05-18" "1952-07-06" ...
## $ AGE_AT_EXAM : num 85 65 59 60 68 72 77 72 67 88 ...
## $ XMSTUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ RELATION : chr "Parent" "Spouse" "Spouse" "Spouse" ...
## $ ANXIETY : chr "Y" "N" "Y" "Y" ...
## $ ASTHMA : chr "N" "N" "N" "N" ...
## $ A_D_D : chr "N" "U" "N" "N" ...
## $ AUTISM : chr "N" "N" "N" "N" ...
## $ CANCER : chr "N" "N" "N" "N" ...
## $ CANCER_TYPE : chr NA NA NA NA ...
## $ DEPRESSION : chr "Y" "N" "N" "Y" ...
## $ DIABETES_TYPE1 : chr "N" "N" "N" "N" ...
## $ DIABETES_TYPE2 : chr "N" "N" "Y" "N" ...
## $ DIABETES : chr "N" "N" "Y" "N" ...
## $ LIPIDS_CHOL : chr "Y" "N" "N" "Y" ...
## $ EPILEPSY : chr "N" "N" "N" "N" ...
## $ GASTRIC_ULCERS : chr "N" "N" "N" "N" ...
## $ HEART_DISEASE : chr "N" "N" "N" "N" ...
## $ HYPERTENSION : chr "Y" "N" "Y" "N" ...
## $ KIDNEY_DISEASE : chr "N" "N" "N" "N" ...
## $ LIVER_DISEASE : chr "N" "N" "N" "N" ...
## $ DEMENTIA : chr "Y" "Y" "Y" "Y" ...
## $ MIGRAINES : chr "N" "N" "N" "U" ...
## $ M_SCLEROSIS : chr "N" "N" "N" "N" ...
## $ OBS_COMPULSIVE : chr "N" "N" "N" "Y" ...
## $ OSTEOARTHRITIS : chr "N" "N" "N" "N" ...
## $ OSTEOPOROSIS : chr "Y" "N" "N" "N" ...
## $ PD : chr "N" "N" "N" "N" ...
## $ ARTHRITIS : chr "N" "N" "N" "N" ...
## $ RHINITIS : chr "N" "N" "N" "N" ...
## $ SPINA_BIFIDA : chr "N" "N" "N" "N" ...
## $ STROKE : chr "N" "N" "N" "N" ...
## $ THYROID_DISEASE: chr "Y" "N" "N" "Y" ...
## $ CIGARETTES : logi NA NA NA NA NA NA ...
## $ CURR_MEDS : logi NA NA NA NA NA NA ...
## read the "MEDICAL_HIST" sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "MEDICAL_HIST")
## names of the columns whose class is logical
logicols <- names(df)[sapply(df, is.logical)]
## show the data type the regenerated DD declares for each of those columns
dfDD %>%
  filter(VarNames %in% logicols) %>%
  select(VarNames, `Data Type`)
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 CIGARETTES CHAR
## 3 CURR_MEDS CHAR
## logical columns that the DD declares as character (CHAR / VARCHAR2);
## matched case-insensitively, like the same step for the other sub-datasets,
## because the DD does not use a consistent case for its data types
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)]
## retype them in place
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
## date-time columns found in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## columns the regenerated DD declares as dates
datecolsFromDD <- dfDD %>%
  filter(`Data Type` %in% c("DATE", "date")) %>%
  pull(VarNames)
## cross-check the two lists to see whether any date variable is missing
## (a) date-time in the data but not a date in the DD
setdiff(datecols, datecolsFromDD)
## character(0)
## (b) date in the DD but not date-time in the data
setdiff(datecolsFromDD, datecols)
## character(0)
## preview the values before converting them
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2018-10-17 1933-05-18
## 2 2018-01-08 1952-07-06
## 3 2018-08-21 1958-10-31
## 4 2018-06-08 1957-10-06
## 5 2018-06-07 1949-07-20
## 6 2018-06-28 1946-01-30
## convert the date-time columns to class Date
df <- df %>%
  mutate(across(all_of(datecols), as.Date))
## recheck the unique data types
df %>% sapply(class) %>% unique()
## [1] "numeric" "character" "Date"
## character columns found in the sub-dataset (40 vars)
chrcols <- names(df)[sapply(df, is.character)]
## compare with the DD to spot data type inconsistencies:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows of the character variables that come with a value specification
tmp <- dfDD %>%
  filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  select(VarNames, `Valid Responses`)
## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## ignore CANCER_TYPE, as it is a multiple values variable
## numeric columns found in the sub-dataset (11 vars)
numcols <- names(df)[sapply(df, is.numeric)]
## numeric variables according to the DD; compare both lists to spot
## data type inconsistencies:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;"
## keep the numeric variables that come with a value specification
tmp <- numColsfromDD %>%
  filter(!is.na(`Valid Responses`))
## check the values of those columns in the dataset against the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## All numeric values are within valid ranges.
## ignore GP
## store the checked sub-dataset back under its own name
MEDICAL_HIST <- df
## next sub-dataset: work on MINT_RC under the generic name df
df <- MINT_RC
## size summary from the sourced info() helper, called with "SYSIND"
info(MINT_RC, "SYSIND")
## #obs:3, cols:221, inds:3
## list the distinct column classes
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so none is cut off)
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 3 obs. of 221 variables:
## $ SYSXM : num 8247903 8300263 8342313
## $ SYSIND : num 11660243 11676853 11667133
## $ SYSGP : num 8011553 8017323 7946313
## $ SYSGPSTUDY : num 1458263 1464033 1387423
## $ SYSINDGP : num 8429313 8445923 8436203
## $ CGI_ORDER : num 1 1 1
## $ GPS_ORDER : num 1 1 1
## $ STDCGI_ORDER : num 11 11 11
## $ LSTUDY : chr "HAFS" "HAFS" "ADCONTROL"
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER"
## $ STUDY : chr "ALZ" "ALZ" "ALZ"
## $ SUBSTUDY : chr "HAFS" "HAFS" "ADCONTROL"
## $ CENTER : chr "IHG" "IHG" "IHG"
## $ GP : num 105805 105811 88254
## $ IND : num 1 1 9005
## $ REFCTR : logi NA NA NA
## $ EXAM_DATE : POSIXct, format: "2023-03-17" "2024-03-18" ...
## $ EXAMINER : chr "gsv32" "mxp1257" "gsv32"
## $ DATE_OF_BIRTH : POSIXct, format: "1955-09-07" "1950-03-22" ...
## $ AGE_AT_EXAM : num 67 73 62
## $ REVIEW_DATE : logi NA NA NA
## $ REVIEWER : logi NA NA NA
## $ MINT1A : logi NA NA NA
## $ MINT1B : num 1 1 1
## $ MINT1C : logi NA NA NA
## $ MINT1D : num 1 1 1
## $ MINT1F : logi NA NA NA
## $ BUTTERFLY_OTHER : logi NA NA NA
## $ MINT2A : logi NA NA NA
## $ MINT2B : num 1 1 1
## $ MINT2C : logi NA NA NA
## $ MINT2D : num 1 1 1
## $ MINT2F : logi NA NA NA
## $ GLOVE_OTHER : logi NA NA NA
## $ MINT3A : logi NA NA NA
## $ MINT3B : num 1 1 1
## $ MINT3C : logi NA NA NA
## $ MINT3D : num 1 1 1
## $ MINT3F : logi NA NA NA
## $ LIGHTBULB_OTHER : logi NA NA NA
## $ MINT4A : logi NA NA NA
## $ MINT4B : num 1 1 1
## $ MINT4C : logi NA NA NA
## $ MINT4D : num 1 1 1
## $ MINT4F : logi NA NA NA
## $ WATCH_OTHER : logi NA NA NA
## $ MINT5A : logi NA NA NA
## $ MINT5B : num 1 1 1
## $ MINT5C : logi NA NA NA
## $ MINT5D : num 1 1 1
## $ MINT5F : logi NA NA NA
## $ CANDLE_OTHER : logi NA NA NA
## $ MINT6A : logi NA NA NA
## $ MINT6B : num 1 1 1
## $ MINT6C : logi NA NA NA
## $ MINT6D : num 1 1 1
## $ MINT6F : logi NA NA NA
## $ CLOWN_OTHER : logi NA NA NA
## $ MINT7A : logi NA NA NA
## $ MINT7B : num 1 1 1
## $ MINT7C : logi NA NA NA
## $ MINT7D : num 1 1 1
## $ MINT7F : logi NA NA NA
## $ KITE_OTHER : logi NA NA NA
## $ MINT8A : logi NA NA NA
## $ MINT8B : num 1 1 1
## $ MINT8C : logi NA NA NA
## $ MINT8D : num 1 1 1
## $ MINT8F : logi NA NA NA
## $ RAINBOW_OTHER : logi NA NA NA
## $ MINT9A : logi NA NA NA
## $ MINT9B : num 1 1 1
## $ MINT9C : logi NA NA NA
## $ MINT9D : num 1 1 1
## $ MINT9F : logi NA NA NA
## $ WITCH_OTHER : logi NA NA NA
## $ MINT10A : logi NA NA NA
## $ MINT10B : num 1 1 1
## $ MINT10C : logi NA NA NA
## $ MINT10D : num 1 1 1
## $ MINT10F : logi NA NA NA
## $ SEESAW_OTHER : logi NA NA NA
## $ MINT11A : logi NA NA NA
## $ MINT11B : num 1 1 1
## $ MINT11C : logi NA NA NA
## $ MINT11D : num 1 1 1
## $ MINT11F : logi NA NA NA
## $ FLASHLIGHT_OTHER : logi NA NA NA
## $ MINT12A : logi NA NA NA
## $ MINT12B : num 1 1 1
## $ MINT12C : logi NA NA NA
## $ MINT12D : num 1 1 1
## $ MINT12F : logi NA NA NA
## $ PEACOCK_OTHER : logi NA NA NA
## $ MINT13A : logi NA NA NA
## $ MINT13B : num 1 1 1
## $ MINT13C : logi NA NA NA
## $ MINT13D : num 1 1 1
## $ MINT13F : logi NA NA NA
## $ SNAIL_OTHER : logi NA NA NA
## $ MINT14A : logi NA NA NA
## $ MINT14B : num 1 1 1
## $ MINT14C : logi NA NA NA
## $ MINT14D : num 1 1 1
## $ MINT14F : logi NA NA NA
## $ WHALE_OTHER : logi NA NA NA
## $ MINT15A : logi NA NA NA
## $ MINT15B : num 1 1 1
## $ MINT15C : logi NA NA NA
## $ MINT15D : num 1 1 1
## $ MINT15F : logi NA NA NA
## $ CAGE_OTHER : logi NA NA NA
## $ MINT16A : logi NA NA NA
## $ MINT16B : num 1 1 1
## $ MINT16C : logi NA NA NA
## $ MINT16D : num 1 1 1
## $ MINT16F : logi NA NA NA
## $ NEST_OTHER : logi NA NA NA
## $ MINT17A : logi NA NA NA
## $ MINT17B : num 1 1 1
## $ MINT17C : logi NA NA NA
## $ MINT17D : num 1 1 1
## $ MINT17F : logi NA NA NA
## $ PLUG_OTHER : logi NA NA NA
## $ MINT18A : logi NA NA NA
## $ MINT18B : num 1 1 1
## $ MINT18C : logi NA NA NA
## $ MINT18D : num 1 1 1
## $ MINT18F : logi NA NA NA
## $ WIG_OTHER : logi NA NA NA
## $ MINT19A : logi NA NA NA
## $ MINT19B : num 1 1 1
## $ MINT19C : logi NA NA NA
## $ MINT19D : num 1 1 1
## $ MINT19F : logi NA NA NA
## $ SCREW_OTHER : logi NA NA NA
## $ MINT20A : logi NA NA NA
## $ MINT20B : num 1 1 1
## $ MINT20C : logi NA NA NA
## $ MINT20D : num 1 1 1
## $ MINT20F : logi NA NA NA
## $ SCARF_OTHER : logi NA NA NA
## $ MINT21A : logi NA NA NA
## $ MINT21B : num 1 1 1
## $ MINT21C : logi NA NA NA
## $ MINT21D : num 1 1 1
## $ MINT21F : logi NA NA NA
## $ WELL_OTHER : logi NA NA NA
## $ MINT22A : logi NA NA NA
## $ MINT22B : num 1 1 1
## $ MINT22C : logi NA NA NA
## $ MINT22D : num 1 1 1
## $ MINT22F : logi NA NA NA
## $ DUSTPAN_OTHER : logi NA NA NA
## $ MINT23A : logi NA NA NA
## $ MINT23B : num 1 1 1
## $ MINT23C : logi NA NA NA
## $ MINT23D : num 1 1 1
## $ MINT23F : logi NA NA NA
## $ PARACHUTE_OTHER : logi NA NA NA
## $ MINT24A : num NA 1 NA
## $ MINT24B : num 1 NA 1
## $ MINT24C : num NA 1 NA
## $ MINT24D : num 1 1 1
## $ MINT24F : logi NA NA NA
## $ BLIND_OTHER : chr NA "BALLENA" NA
## $ MINT25A : logi NA NA NA
## $ MINT25B : num 1 1 1
## $ MINT25C : logi NA NA NA
## $ MINT25D : num 1 1 1
## $ MINT25F : logi NA NA NA
## $ HINGE_OTHER : logi NA NA NA
## $ MINT26A : logi NA NA NA
## $ MINT26B : num 1 1 1
## $ MINT26C : logi NA NA NA
## $ MINT26D : num 1 1 1
## $ MINT26F : logi NA NA NA
## $ FUNNEL_OTHER : logi NA NA NA
## $ MINT27A : num NA 1 NA
## $ MINT27B : num 1 NA 1
## $ MINT27C : num NA 1 NA
## $ MINT27D : num 1 1 1
## $ MINT27F : logi NA NA NA
## $ GAUGE_OTHER : chr NA "BISAGRA" NA
## $ MINT28A : num NA 1 NA
## $ MINT28B : num 1 NA 1
## $ MINT28C : num NA 0 NA
## $ MINT28D : num 1 0 1
## $ MINT28F : num NA 0 NA
## $ PORTHOLE_OTHER : chr NA "NONE" NA
## $ MINT29A : num 1 1 NA
## $ MINT29B : num NA NA 1
## $ MINT29C : num 0 1 NA
## $ MINT29D : num 0 1 1
## $ MINT29F : num 0 NA NA
## $ ANVIL_OTHER : chr "doesn't know" "yunque" NA
## $ MINT30A : logi NA NA NA
## $ MINT30B : num 1 1 1
## $ MINT30C : logi NA NA NA
## $ MINT30D : num 1 1 1
## $ MINT30F : logi NA NA NA
## $ MORTAR_OTHER : logi NA NA NA
## $ MINT31A : num NA 1 NA
## $ MINT31B : num 1 NA 1
## $ MINT31C : num NA 0 NA
## $ MINT31D : num 1 0 1
## $ MINT31F : num NA 0 NA
## $ PESTLE_OTHER : chr NA "none" NA
## $ MINT32A : logi NA NA NA
## $ MINT32B : num 1 1 1
## $ MINT32C : logi NA NA NA
## $ MINT32D : num 1 1 1
## $ MINT32F : logi NA NA NA
## $ AXLE_OTHER : logi NA NA NA
## $ COMMENTS_MINT : logi NA NA NA
## $ MINT_TOT_NO_CUE : num 31 27 32
## $ MINT_STIM_CUE : num 0 3 0
## $ MINT_PHON_CUE : num 1 2 0
## $ MINT_CORR_PHON_CUE : num 0 0 0
## $ MINT_CORR_STIM_CUE : num 31 30 32
## $ MINT_CORR_STIM_CUE_STATUS: logi NA NA NA
## read the "MINT_RC" sheet of the revised data dictionary (DD)
dfDD <- read_excel(path = revisedDDpath, sheet = "MINT_RC")
## names of the columns whose class is logical (115 vars)
logicols <- names(df)[sapply(df, is.logical)]
## show the data type the regenerated DD declares for each of those columns
dfDD %>%
  filter(VarNames %in% logicols) %>%
  select(VarNames, `Data Type`)
## # A tibble: 115 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 MINT1A NUMBER(1)
## 5 MINT1C NUMBER(1)
## 6 MINT1F NUMBER(1)
## 7 BUTTERFLY_OTHER CHAR
## 8 MINT2A NUMBER(1)
## 9 MINT2C NUMBER(1)
## 10 MINT2F NUMBER(1)
## # ℹ 105 more rows
## group the logical columns by the data type the DD declares for them
## (matched case-insensitively; the DD sheet uses both "DATE"-style upper
## case and lower-case "date")
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 31 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                                grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 1 var
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols &
                               grepl("NUMBER", dfDD$`Data Type`, ignore.case = TRUE)] ## 83 vars
## retype each group in place; every group must be converted from its OWN
## columns, otherwise the replacement is recycled across the target columns
## and overwrites their contents
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2num] <- lapply(df[convert2num], as.numeric)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns found in the sub-dataset (class POSIXct/POSIXt)
datecols <- names(df)[sapply(df, inherits, what = c("POSIXct", "POSIXt"))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## columns the regenerated DD declares as dates
datecolsFromDD <- dfDD %>%
  filter(`Data Type` %in% c("DATE", "date")) %>%
  pull(VarNames)
## cross-check the two lists to see whether any date variable is missing
## (a) date-time in the data but not a date in the DD
setdiff(datecols, datecolsFromDD)
## character(0)
## (b) date in the DD but not date-time in the data; REVIEW_DATE can be
##     ignored, it has been corrected in the previous step
setdiff(datecolsFromDD, datecols)
## [1] "REVIEW_DATE"
## preview the values before converting them
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-03-17 1955-09-07
## 2 2024-03-18 1950-03-22
## 3 2023-01-18 1960-05-04
## convert the date-time columns to class Date
df <- df %>%
  mutate(across(all_of(datecols), as.Date))
## recheck the unique data types
df %>% sapply(class) %>% unique()
## [1] "numeric" "character" "Date"
## character columns found in the sub-dataset (42 vars)
chrcols <- names(df)[sapply(df, is.character)]
## compare with the DD to spot data type inconsistencies:
##   mismatchChrs_1: character in the data but another type in the DD
##   mismatchChrs_2: character in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD$VarNames) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD$VarNames, chrcols) ## character(0)
## DD rows of the character variables that come with a value specification
tmp <- dfDD %>%
  filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  select(VarNames, `Valid Responses`)
## check whether the unique values of those columns in the dataset match the DD
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER
## numeric columns found in the sub-dataset (176 vars)
numcols <- names(df)[sapply(df, is.numeric)]
## numeric variables according to the DD; compare both lists to spot
## data type inconsistencies:
##   mismatchNums_1: numeric in the data but another type in the DD
##   mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD$VarNames) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD$VarNames, numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD$`Valid Responses`)
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;"
## [5] "0;\r\n1;" "1;\r\n0;"
## keep the numeric variables that come with a value specification
tmp <- numColsfromDD %>%
  filter(!is.na(`Valid Responses`))
## check the values of those columns in the dataset against the DD
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the checked sub-dataset back under its own name
MINT_RC <- df
## next sub-dataset: work on MINT_SP_RC under the generic name df
df <- MINT_SP_RC
## size summary from the sourced info() helper, called with "SYSIND"
info(MINT_SP_RC, "SYSIND")
## #obs:303, cols:221, inds:301
## list the distinct column classes
df %>% sapply(class) %>% unique()
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the structure of every column (list.len raised so none is cut off)
str(object = df, list.len = 99999, max.level = 99)
## 'data.frame': 303 obs. of 221 variables:
## $ SYSXM : num 8260003 8260193 8277393 8278083 8260823 ...
## $ SYSIND : num 11163453 11620563 11620453 11621333 11621203 ...
## $ SYSGP : num 7924953 8005633 8005523 8006293 8006163 ...
## $ SYSGPSTUDY : num 1363063 1452343 1452233 1453003 1452873 ...
## $ SYSINDGP : num 7926663 8389633 8389523 8390403 8390273 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCONTROL" "ADCONTROL" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87923 104477 104476 104528 104455 ...
## $ IND : num 9000 1 1 1 1 1 1 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-10-25" "2023-05-15" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1967-06-15" "1949-12-01" ...
## $ AGE_AT_EXAM : num 56 73 73 86 81 77 67 80 74 73 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ MINT1A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT1B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT1C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT1D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT1F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ TAMBOR_OTHER_SP : chr NA NA NA NA ...
## $ MINT2A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT2B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT2C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT2D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT2F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ GLOVE_OTHER_SP : chr NA NA NA NA ...
## $ MINT3A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT3B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT3C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT3D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT3F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ LIGHTBULB_OTHER_SP : chr NA NA NA NA ...
## $ MINT4A_SP : num NA NA NA 1 1 NA NA NA NA 1 ...
## $ MINT4B_SP : num 1 1 1 NA NA 1 1 1 1 NA ...
## $ MINT4C_SP : num NA NA NA 1 1 NA NA NA NA 0 ...
## $ MINT4D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT4F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ WATCH_OTHER_SP : chr NA NA NA "-" ...
## $ MINT5A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT5B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT5C_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT5D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT5F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ CANDLE_OTHER_SP : chr NA NA NA NA ...
## $ MINT6A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT6B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT6C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT6D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT6F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ CLOWN_OTHER_SP : chr NA NA NA NA ...
## $ MINT7A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT7B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT7C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT7D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT7F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ KITE_OTHER_SP : chr NA NA NA NA ...
## $ MINT8A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT8B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT8C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT8D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT8F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ RAINBOW_OTHER_SP : chr NA NA NA NA ...
## $ MINT9A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT9B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT9C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT9D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT9F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ WITCH_OTHER_SP : chr NA NA NA NA ...
## $ MINT10A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT10B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT10C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT10D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT10F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ SEESAW_OTHER_SP : chr NA NA NA NA ...
## $ MINT11A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT11B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT11C_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT11D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT11F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ FLASHLIGHT_OTHER_SP : chr NA NA NA NA ...
## $ MINT12A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT12B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT12C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT12D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT12F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ PEACOCK_OTHER_SP : chr NA NA NA NA ...
## $ MINT13A_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ MINT13B_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT13C_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ MINT13D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT13F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ SNAIL_OTHER_SP : chr NA NA NA NA ...
## $ MINT14A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT14B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT14C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT14D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT14F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ WHALE_OTHER_SP : chr NA NA NA NA ...
## $ MINT15A_SP : num NA NA 1 NA 1 NA NA NA NA 1 ...
## $ MINT15B_SP : num 1 1 NA 1 NA 1 1 1 1 NA ...
## $ MINT15C_SP : num NA NA 1 NA 1 NA NA NA NA 1 ...
## $ MINT15D_SP : num NA NA 1 NA 1 NA NA NA NA 1 ...
## $ MINT15F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ CAGE_OTHER_SP : chr NA NA "---" NA ...
## $ MINT16A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT16B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT16C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT16D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT16F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ NEST_OTHER_SP : chr NA NA NA NA ...
## $ MINT17A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT17B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT17C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT17D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT17F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ PLUG_OTHER_SP : chr NA NA NA NA ...
## $ MINT18A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT18B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT18C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT18D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT18F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ WIG_OTHER_SP : chr NA NA NA NA ...
## $ MINT19A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT19B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT19C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT19D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT19F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ SCREW_OTHER_SP : chr NA NA NA NA ...
## $ MINT20A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT20B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT20C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT20D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT20F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ SCARF_OTHER_SP : chr NA NA NA NA ...
## $ MINT21A_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ MINT21B_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT21C_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ MINT21D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT21F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ WELL_OTHER_SP : chr NA NA NA NA ...
## $ MINT22A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT22B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT22C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT22D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT22F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ DUSTPAN_OTHER_SP : chr NA NA NA NA ...
## $ MINT23A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT23B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT23C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT23D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT23F_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ PARACHUTE_OTHER_SP : chr NA NA NA NA ...
## $ MINT24A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT24B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT24C_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ MINT24D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT24F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ BLIND_OTHER_SP : chr NA NA NA NA ...
## $ MINT25A_SP : num NA NA NA NA NA NA 1 NA NA 1 ...
## $ MINT25B_SP : num 1 1 1 1 1 1 NA 1 1 NA ...
## $ MINT25C_SP : num NA NA NA NA NA NA 0 NA NA 1 ...
## $ MINT25D_SP : num 1 1 1 1 1 1 0 1 1 1 ...
## $ MINT25F_SP : num NA NA NA NA NA NA 1 NA NA NA ...
## $ HINGE_OTHER_SP : chr NA NA NA NA ...
## $ MINT26A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT26B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT26C_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT26D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT26F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ FUNNEL_OTHER_SP : chr NA NA NA NA ...
## $ MINT27A_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT27B_SP : num 1 1 1 1 1 1 1 1 1 NA ...
## $ MINT27C_SP : num NA NA NA NA NA NA NA NA NA 1 ...
## $ MINT27D_SP : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MINT27F_SP : num NA NA NA NA NA NA NA NA NA NA ...
## $ GAUGE_OTHER_SP : chr NA NA NA NA ...
## $ MINT28A_SP : num 1 NA NA NA 1 NA NA NA NA 1 ...
## $ MINT28B_SP : num NA 1 1 1 NA 1 1 1 1 NA ...
## $ MINT28C_SP : num 0 NA NA NA 1 NA NA NA NA 1 ...
## $ MINT28D_SP : num 0 1 1 1 1 1 1 1 1 1 ...
## $ MINT28F_SP : num 1 NA NA NA NA NA NA NA NA NA ...
## $ PORTHOLE_OTHER_SP : chr "bisagra" NA NA NA ...
## $ MINT29A_SP : num 1 1 NA NA NA NA 1 NA NA NA ...
## $ MINT29B_SP : num NA NA 1 1 1 1 NA 1 1 1 ...
## $ MINT29C_SP : num 0 0 NA NA NA NA 0 NA NA NA ...
## $ MINT29D_SP : num 0 0 1 1 1 1 0 1 1 1 ...
## $ MINT29F_SP : num 0 0 NA NA NA NA 0 NA NA NA ...
## $ ANVIL_OTHER_SP : chr "n/a" NA NA NA ...
## $ MINT30A_SP : num NA 1 NA NA NA NA NA NA NA 1 ...
## $ MINT30B_SP : num 1 NA 1 1 1 1 1 1 1 NA ...
## $ MINT30C_SP : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ MINT30D_SP : num 1 0 1 1 1 1 1 1 1 0 ...
## $ MINT30F_SP : num NA 0 NA NA NA NA NA NA NA 0 ...
## $ MORTAR_OTHER_SP : chr NA NA NA NA ...
## $ MINT31A_SP : num 1 1 NA NA NA NA 1 NA NA 1 ...
## $ MINT31B_SP : num NA NA 1 1 1 1 NA 1 1 NA ...
## $ MINT31C_SP : num 0 0 NA NA NA NA 0 NA NA 0 ...
## $ MINT31D_SP : num 0 0 1 1 1 1 0 1 1 0 ...
## $ MINT31F_SP : num 0 0 NA NA NA NA 0 NA NA 0 ...
## $ PESTLE_OTHER_SP : chr "n/a" NA NA NA ...
## $ MINT32A_SP : num NA NA NA NA NA NA 1 NA NA 1 ...
## $ MINT32B_SP : num 1 1 1 1 1 1 NA 1 1 NA ...
## $ MINT32C_SP : num NA NA NA NA NA NA 1 NA NA 0 ...
## $ MINT32D_SP : num 1 1 1 1 1 1 1 1 1 0 ...
## $ MINT32F_SP : num NA NA NA NA NA NA NA NA NA 0 ...
## $ AXLE_OTHER_SP : chr NA NA NA NA ...
## $ COMMENTS_MINT_SP : chr NA NA NA NA ...
## $ MINT_TOT_NO_CUE_SP : num 29 29 31 31 29 32 28 32 32 3 ...
## $ MINT_STIM_CUE_SP : num 0 0 1 1 3 0 1 0 0 7 ...
## $ MINT_PHON_CUE_SP : num 3 3 0 0 0 0 3 0 0 22 ...
## $ MINT_CORR_PHON_CUE_SP : num 1 0 0 0 0 0 1 0 0 8 ...
## $ MINT_CORR_STIM_CUE_SP : num 29 29 32 32 32 32 29 32 32 10 ...
## $ MINT_CORR_STIM_CUE_SP_STATUS: chr NA NA NA NA ...
## load the regenerated data dictionary (DD) sheet that describes MINT_SP_RC
dfDD <- read_excel(revisedDDpath, sheet = "MINT_SP_RC")
## extract all logical variables
## (presumably columns that were completely empty in the export, so they were typed as logical -- TODO confirm)
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## pick the target type of every logical column from its DD `Data Type`
## (case-insensitive match; TRUE is spelled out because T can be reassigned)
convert2chr <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("CHAR", dfDD$`Data Type`, ignore.case = TRUE)] ## 2 vars
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## 1 var
## convert
df[convert2chr] <- lapply(df[convert2chr], as.character)
df[convert2date] <- lapply(df[convert2date], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date-time variables from sub-dataset (columns stored as POSIXct)
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD
## the type is matched case-insensitively so "DATE", "date" and "Date" are all recognised
datecolsFromDD <- dfDD$VarNames[grepl("^date$", dfDD$`Data Type`, ignore.case = TRUE)]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## REVIEW_DATE, ignore it, it has been corrected in previous step
## [1] "REVIEW_DATE"
## preview the date columns; drop = FALSE keeps a data frame even when only one column is selected
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2023-10-25 1967-06-15
## 2 2023-05-15 1949-12-01
## 3 2023-05-15 1950-04-02
## 4 2023-05-09 1936-05-22
## 5 2023-02-24 1941-10-04
## 6 2023-08-11 1946-06-19
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns as they are stored in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 42 vars
## cross-check the storage type against the DD:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## DD rows of character variables that come with a value specification
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)
## compare the observed values of those columns with the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## ignore EXAMINER
## numeric columns as they are stored in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 176 vars
## numeric variables declared in the DD, cross-checked against the data:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;"
## [5] "0;\r\n1;" "1;\r\n0;"
## keep only the numeric variables that actually carry a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected copy back under the dataset name
MINT_SP_RC <- df
## ---- MOCA_RC: work on a copy named `df`; it is written back to MOCA_RC at the end of this section ----
df <- MOCA_RC
## summary from the project helper info() (defined outside this section);
## judging by the echoed output it reports #obs, #cols and the number of distinct SYSIND -- TODO confirm
info(MOCA_RC,"SYSIND")
## #obs:585, cols:140, inds:580
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the full structure; the large limits keep str() from truncating the column list
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 585 obs. of 140 variables:
## $ SYSXM : num 8258783 8258823 8259093 8260053 8260123 ...
## $ SYSIND : num 11037673 11369813 11024163 11620563 11362953 ...
## $ SYSGP : num 7894423 7952013 7889113 8005633 7946353 ...
## $ SYSGPSTUDY : num 1309743 1397123 1304453 1452343 1387463 ...
## $ SYSINDGP : num 7793413 8139083 7779783 8389633 8132223 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADFAMPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 87650 88301 87536 104477 87545 ...
## $ IND : num 9000 1 112 1 106 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2023-10-24" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "jjs2031" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1954-10-29" "1947-05-13" ...
## $ AGE_AT_EXAM : num 68 76 79 73 66 81 86 73 81 60 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ MOCALOC : num 2 2 3 3 2 2 3 2 2 2 ...
## $ MOCALOC_OTHER : chr NA NA NA NA ...
## $ MOCALAN : num 2 2 2 2 2 2 2 2 2 2 ...
## $ MOCALANX : logi NA NA NA NA NA NA ...
## $ MOCATRAI : num 0 1 1 1 0 1 1 1 1 1 ...
## $ MOCACUBE : num 0 0 1 0 0 0 0 0 1 1 ...
## $ MOCACLOC : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCACLON : num 1 0 1 1 1 1 1 0 0 1 ...
## $ MOCACLOH : num 1 0 1 1 0 0 0 1 1 0 ...
## $ MOCANAMI_LION : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCANAMI_LION_OTH : chr NA NA NA NA ...
## $ MOCANAMI_RHINO : num 1 0 0 1 1 1 1 1 1 1 ...
## $ MOCANAMI_RHINO_OTH : chr NA NA NA NA ...
## $ MOCANAMI_CAMEL : num 1 1 1 1 1 1 1 0 1 1 ...
## $ MOCANAMI_CAMEL_OTH : chr NA NA NA NA ...
## $ MOCAREGI1 : num 0 0 1 1 1 1 0 1 1 1 ...
## $ MOCAREGI2 : num 1 1 1 0 1 0 0 1 1 1 ...
## $ MOCAREGI3 : num 1 1 0 0 0 1 1 1 1 0 ...
## $ MOCAREGI4 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCAREGI5 : num 1 0 1 1 1 1 1 0 0 1 ...
## $ MOCAREGI6 : num 1 1 1 1 1 0 1 1 1 1 ...
## $ MOCAREGI7 : num 0 1 1 1 1 1 1 1 1 1 ...
## $ MOCAREGI8 : num 1 1 1 1 1 1 1 1 0 1 ...
## $ MOCAREGI9 : num 1 1 0 1 1 1 0 1 1 1 ...
## $ MOCAREGI10 : num 1 1 1 1 1 1 0 1 1 1 ...
## $ MOCADIGI_FORW : num 1 1 1 1 1 0 0 1 1 1 ...
## $ MOCADIGI_FORW_INCORRECT : chr NA NA NA NA ...
## $ MOCADIGI_BACK : num 1 1 1 0 1 1 0 0 1 1 ...
## $ MOCADIGI_BACK_INCORRECT : chr NA NA NA NA ...
## $ MOCALETT : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCASER7_93 : num 1 0 1 1 1 1 1 1 1 1 ...
## $ MOCASER7_93_OTH : logi NA NA NA NA NA NA ...
## $ MOCASER7_86 : num 0 0 1 1 0 1 1 0 1 1 ...
## $ MOCASER7_86_OTH : logi NA NA NA NA NA NA ...
## $ MOCASER7_79 : num 1 0 1 1 1 1 1 0 0 1 ...
## $ MOCASER7_79_OTH : logi NA NA NA NA NA NA ...
## $ MOCASER7_72 : num 1 0 1 1 1 1 1 0 1 1 ...
## $ MOCASER7_72_OTH : logi NA NA NA NA NA NA ...
## $ MOCASER7_65 : num 0 0 1 1 0 0 1 0 0 1 ...
## $ MOCASER7_65_OTH : logi NA NA NA NA NA NA ...
## $ MOCAREPE_1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCAREPE_2 : num 1 0 1 1 0 0 0 1 1 1 ...
## $ MOCAFLUEF_60SEC : chr "finca, feo, flor, farmacia, fosforo, freno, ficticio" "FRACCION FAMILIA FRACCION (X) FUERTE" "Feo Fricos Farmacia Faro" "FALSO FEO FRIALDAD FENOMENO FACILIDAD FELICIDAD" ...
## $ MOCAFLUE_SCORE : num 0 0 0 0 1 0 0 1 1 1 ...
## $ MOCAABST_TRAIN : num 1 1 1 1 0 0 1 1 1 1 ...
## $ MOCAABST_RULER : num 1 1 1 1 0 1 0 1 1 1 ...
## $ MOCARECN_1 : num 1 NA NA NA 1 NA NA NA NA NA ...
## $ MOCARECN_2 : num 1 NA NA NA NA NA NA NA 1 1 ...
## $ MOCARECN_3 : num 1 NA NA NA 1 NA 1 NA NA 1 ...
## $ MOCARECN_4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MOCARECN_5 : num NA NA NA NA NA NA 1 NA 1 NA ...
## $ MOCARECC_1 : num NA NA NA 1 NA NA NA NA NA NA ...
## $ MOCARECC_2 : num NA NA NA 1 1 NA 1 NA NA NA ...
## $ MOCARECC_3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MOCARECC_4 : num NA NA NA 1 NA NA 1 NA NA NA ...
## $ MOCARECC_5 : num 1 1 NA 1 NA 1 NA NA NA NA ...
## $ MOCARECR_1 : num NA 1 1 NA NA NA 1 1 1 1 ...
## $ MOCARECR_2 : num NA 1 1 NA NA NA NA 1 NA NA ...
## $ MOCARECR_3 : num NA 1 1 1 NA 1 NA 1 1 NA ...
## $ MOCARECR_4 : num 1 1 1 NA 1 1 NA NA 1 1 ...
## $ MOCARECR_5 : num NA NA 1 NA 1 NA NA NA NA 1 ...
## $ MOCARECN_REC1 : num NA NA NA NA NA 0 NA NA NA NA ...
## $ MOCARECN_REC2 : num NA NA NA NA NA 0 NA NA NA NA ...
## $ MOCARECN_REC3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MOCARECN_REC4 : num NA NA NA NA NA NA NA 0 NA NA ...
## $ MOCARECN_REC5 : num NA NA NA NA NA NA NA 0 NA NA ...
## $ MOCAORDT_ENTRY : POSIXct, format: "2023-10-23" "2024-02-13" ...
## $ MOCAORDT : num 0 1 1 1 0 1 1 1 1 1 ...
## $ MOCAORMO_ENTRY : chr NA NA NA NA ...
## $ MOCAORMO : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCAORYR_ENTRY : chr NA NA NA NA ...
## $ MOCAORYR : num 1 1 1 1 0 1 1 1 1 1 ...
## $ MOCAORDY_ENTRY : chr NA NA NA NA ...
## $ MOCAORDY : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCAORPL_ENTRY : chr NA NA NA NA ...
## $ MOCAORPL : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCAORCT_ENTRY : chr NA NA NA NA ...
## $ MOCAORCT : num 1 1 1 1 1 1 1 1 1 1 ...
## $ MOCA_EDU : num 1 0 0 0 1 1 0 0 0 0 ...
## $ NACC_MOCA : chr NA NA NA NA ...
## $ MOCAVISEXE_SCORE : num 3 2 5 4 2 3 3 3 4 4 ...
## $ MOCAVISEXE_SCORE_STATUS : chr NA NA NA NA ...
## $ MOCANAMI_SCORE : num 3 2 2 3 3 3 3 2 3 3 ...
## $ MOCANAMI_SCORE_STATUS : chr NA NA NA NA ...
## $ SCORE_REGISTRATION : num 8 8 8 8 9 8 6 9 8 9 ...
## $ SCORE_REGISTRATION_STATUS : logi NA NA NA NA NA NA ...
## $ MOCADIGI_SCORE : num 2 2 2 1 2 1 0 1 2 2 ...
## $ MOCADIGI_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCASER7_93_SCORE : num 1 0 1 1 1 1 1 1 1 1 ...
## $ MOCASER7_93_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCASER7_86_SCORE : num 0 0 1 1 0 1 1 0 1 1 ...
## $ MOCASER7_86_SCORE_SCORE_STATUS: logi NA NA NA NA NA NA ...
## $ MOCASER7_79_SCORE : num 1 0 1 1 1 1 1 0 0 1 ...
## $ MOCASER7_79_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCASER7_72_SCORE : num 1 0 1 1 1 1 1 0 1 1 ...
## $ MOCASER7_72_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCASER7_65_SCORE : num 0 0 1 1 0 0 1 0 0 1 ...
## $ MOCASER7_65_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCASER7_SCORE : num 3 0 5 5 3 4 5 1 3 5 ...
## $ MOCASER7_SCORE_STATUS : chr NA NA NA NA ...
## $ MOCASER7_POINTSCORE : num 2 0 3 3 2 3 3 1 2 3 ...
## $ MOCASER7_POINTSCORE_STATUS : chr NA NA NA NA ...
## $ MOCAREPE_SCORE : num 2 1 2 2 1 1 1 2 2 2 ...
## $ MOCAREPE_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCAABST_SCORE : num 2 2 2 2 0 1 1 2 2 2 ...
## $ MOCAABST_SCORE_STATUS : logi NA NA NA NA NA NA ...
## $ MOCARECN_SCORE : num 3 0 0 0 2 0 2 0 2 2 ...
## $ MOCARECN_SCORE_STATUS : chr "partial" NA NA NA ...
## $ MOCARECC_SCORE : num 1 1 0 4 1 1 2 0 0 0 ...
## $ MOCARECC_SCORE_STATUS : chr "partial" "partial" NA "partial" ...
## $ MOCARECR_SCORE : num 1 4 5 1 2 2 1 3 3 3 ...
## $ MOCARECR_SCORE_STATUS : chr "partial" "partial" NA "partial" ...
## $ MOCAOR_SCORE : num 5 6 6 6 4 6 6 6 6 6 ...
## $ MOCAOR_SCORE_STATUS : chr NA NA NA NA ...
## $ MOCATOTS : num 23 16 23 22 18 19 20 19 25 26 ...
## $ MOCATOTS_STATUS : chr "partial" "partial" "partial" "partial" ...
## $ NACCMOCA : num 24 16 23 22 19 20 20 19 25 26 ...
## $ NACCMOCA_STATUS : chr "partial" "partial" "partial" "partial" ...
## load the regenerated data dictionary (DD) sheet that describes MOCA_RC
dfDD <- read_excel(revisedDDpath, sheet = "MOCA_RC")
## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 18 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## 4 MOCALANX VARCHAR2(25)
## 5 MOCASER7_93_OTH VARCHAR2(5)
## 6 MOCASER7_86_OTH VARCHAR2(5)
## 7 MOCASER7_79_OTH VARCHAR2(5)
## 8 MOCASER7_72_OTH VARCHAR2(5)
## 9 MOCASER7_65_OTH VARCHAR2(5)
## 10 SCORE_REGISTRATION_STATUS CHAR
## 11 MOCADIGI_SCORE_STATUS CHAR
## 12 MOCASER7_93_SCORE_STATUS CHAR
## 13 MOCASER7_86_SCORE_SCORE_STATUS CHAR
## 14 MOCASER7_79_SCORE_STATUS CHAR
## 15 MOCASER7_72_SCORE_STATUS CHAR
## 16 MOCASER7_65_SCORE_STATUS CHAR
## 17 MOCAREPE_SCORE_STATUS CHAR
## 18 MOCAABST_SCORE_STATUS CHAR
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## select the vars to be converted to numeric; derived from THIS sheet's DD so the result
## never depends on a `convert2num` left in the workspace by an earlier dataset section
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## none for MOCA_RC
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## 17 vars
## convert (assigning to an empty column selection is a no-op, so empty groups are safe)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## date-time columns of the sub-dataset, i.e. those stored as POSIXct
datecols <- names(Filter(function(col) inherits(col, c("POSIXct", "POSIXt")), df))
## [1] "EXAM_DATE" "DATE_OF_BIRTH" "MOCAORDT_ENTRY"
## date variables declared in the regenerated DD
datecolsFromDD <- dfDD[["VarNames"]][dfDD[["Data Type"]] %in% c("DATE","date","Date")]
## any date-time column in the data that the DD does not declare as a date?
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
## any DD date that is not a date-time column in the data?
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## preview the values before converting them
head(df[, datecols])
## EXAM_DATE DATE_OF_BIRTH MOCAORDT_ENTRY
## 1 2023-10-24 1954-10-29 2023-10-23
## 2 2024-02-13 1947-05-13 2024-02-13
## 3 2023-10-25 1944-07-28 2023-10-25
## 4 2023-05-15 1949-12-01 2023-05-15
## 5 2024-02-20 1957-08-05 2024-02-20
## 6 2024-02-15 1942-09-30 2024-02-15
## turn every POSIXct column into a plain Date
df[datecols] <- Map(as.Date, df[datecols])
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns as they are stored in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 46 vars
## cross-check the storage type against the DD:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## DD rows of character variables that come with a value specification
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)
## compare the observed values of those columns with the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## numeric columns as they are stored in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 90 vars
## numeric variables declared in the DD, cross-checked against the data:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n2;\r\n3;"
## [5] "1;\r\n0;" "1;\r\n0;\r\n" "1;" "0;"
## keep only the numeric variables that actually carry a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected copy back under the dataset name
MOCA_RC <- df
## ---- NUMBER_SPAN_RC: work on a copy named `df`; it is written back to NUMBER_SPAN_RC at the end of this section ----
df <- NUMBER_SPAN_RC
## summary from the project helper info() (defined outside this section);
## judging by the echoed output it reports #obs, #cols and the number of distinct SYSIND -- TODO confirm
info(NUMBER_SPAN_RC,"SYSIND")
## #obs:527, cols:85, inds:522
## extract all the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "logical"
##
## [[4]]
## [1] "POSIXct" "POSIXt"
## print the full structure; the large limits keep str() from truncating the column list
str(df, max.level = 99, list.len = 99999)
## 'data.frame': 527 obs. of 85 variables:
## $ SYSXM : num 8276493 8258843 8258873 8260113 8277623 ...
## $ SYSIND : num 11369703 11369813 11037673 11620563 11435853 ...
## $ SYSGP : num 7951913 7952013 7894423 8005633 7962813 ...
## $ SYSGPSTUDY : num 1397023 1397123 1309743 1452343 1407923 ...
## $ SYSINDGP : num 8138973 8139083 7793413 8389633 8205123 ...
## $ CGI_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ GPS_ORDER : num 1 1 1 1 1 1 1 1 1 1 ...
## $ STDCGI_ORDER : num 11 11 11 11 11 11 11 11 11 11 ...
## $ LSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ DB_OWNER : chr "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" "CLINIC_USER" ...
## $ STUDY : chr "ALZ" "ALZ" "ALZ" "ALZ" ...
## $ SUBSTUDY : chr "ADCRLPRADI" "ADCRLPRADI" "ADFAMPRADI" "ADCONTROL" ...
## $ CENTER : chr "IHG" "IHG" "IHG" "IHG" ...
## $ GP : num 88299 88301 87650 104477 88452 ...
## $ IND : num 1 1 9000 1 1 1 105 1 1 1 ...
## $ REFCTR : logi NA NA NA NA NA NA ...
## $ EXAM_DATE : POSIXct, format: "2024-02-13" "2024-02-13" ...
## $ EXAMINER : chr "gsv32" "jjs2031" "gsv32" "jjs2031" ...
## $ DATE_OF_BIRTH : POSIXct, format: "1944-09-22" "1947-05-13" ...
## $ AGE_AT_EXAM : num 79 76 68 73 81 86 71 73 81 79 ...
## $ REVIEW_DATE : logi NA NA NA NA NA NA ...
## $ REVIEWER : logi NA NA NA NA NA NA ...
## $ SPF3_R1 : chr "184" NA NA "184" ...
## $ SPF3_1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SPF3_R2 : chr "279" NA NA "279" ...
## $ SPF3_2 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SPF4_R1 : chr "4162" NA NA "4162" ...
## $ SPF4_1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SPF4_R2 : chr "8195" NA NA "8195" ...
## $ SPF4_2 : num 1 1 1 1 0 1 0 1 1 1 ...
## $ SPF5_R1 : chr "64928" NA NA "64928" ...
## $ SPF5_1 : num 1 1 1 1 0 1 0 0 1 0 ...
## $ SPF5_R2 : chr "73861" NA NA "73861" ...
## $ SPF5_2 : num 1 1 1 1 0 1 0 1 0 1 ...
## $ SPF6_R1 : chr "392475" "39245" NA "392475" ...
## $ SPF6_1 : num 1 0 1 1 NA 0 NA 0 0 0 ...
## $ SPF6_R2 : chr "628319" "628399" NA "628319" ...
## $ SPF6_2 : num 1 0 1 1 NA 0 NA 0 0 0 ...
## $ SPF7_R1 : chr "9687156" NA NA "9647153" ...
## $ SPF7_1 : num 0 NA 0 1 NA NA NA NA NA NA ...
## $ SPF7_R2 : chr "749281" NA NA "7492681" ...
## $ SPF7_2 : num 0 NA 0 1 NA NA NA NA NA NA ...
## $ SPF8_R1 : chr NA NA NA "47528169" ...
## $ SPF8_1 : num NA NA NA 0 NA NA NA NA NA NA ...
## $ SPF8_R2 : chr NA NA NA "29753618" ...
## $ SPF8_2 : num NA NA NA 0 NA NA NA NA NA NA ...
## $ SPF9_R1 : chr NA NA NA NA ...
## $ SPF9_1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPF9_R2 : chr NA NA NA NA ...
## $ SPF9_2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPF_LONGEST : num 6 5 6 10 4 5 4 4 5 4 ...
## $ SPB2_R1 : chr "52" NA NA "52" ...
## $ SPB2_1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SPB2_R2 : chr "74" NA NA "74" ...
## $ SPB2_2 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ SPB3_R1 : chr "926" NA NA "692" ...
## $ SPB3_1 : num 0 1 1 1 0 1 0 1 0 0 ...
## $ SPB3_R2 : chr "473" NA NA "473" ...
## $ SPB3_2 : num 1 1 1 1 1 1 0 0 0 0 ...
## $ SPB4_R1 : chr "6761" NA NA "68176" ...
## $ SPB4_1 : num 0 1 0 0 0 0 NA 0 NA NA ...
## $ SPB4_R2 : chr "1536" "351" NA "6315" ...
## $ SPB4_2 : num 0 0 0 0 0 0 NA 0 NA NA ...
## $ SPB5_R1 : chr NA "9162" NA NA ...
## $ SPB5_1 : num NA 0 NA NA NA NA NA NA NA NA ...
## $ SPB5_R2 : chr NA "61927" NA NA ...
## $ SPB5_2 : num NA 0 NA NA NA NA NA NA NA NA ...
## $ SPB6_R1 : chr NA NA NA NA ...
## $ SPB6_1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB6_R2 : chr NA NA NA NA ...
## $ SPB6_2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB7_R1 : chr NA NA NA NA ...
## $ SPB7_1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB7_R2 : chr NA NA NA NA ...
## $ SPB7_2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB8_R1 : chr NA NA NA NA ...
## $ SPB8_1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB8_R2 : chr NA NA NA NA ...
## $ SPB8_2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ SPB_LONGEST : num 3 4 3 4 3 3 2 3 2 2 ...
## $ COMMENTS_SPF_SPB : chr NA NA NA NA ...
## $ SPF_TOTALSCORE : num 8 6 8 10 3 6 3 5 5 5 ...
## $ SPF_TOTALSCORE_STATUS: chr "partial" "partial" "partial" "partial" ...
## $ SPB_TOTALSCORE : num 3 5 4 4 3 4 2 3 2 2 ...
## $ SPB_TOTALSCORE_STATUS: chr "partial" "partial" "partial" "partial" ...
## load the regenerated data dictionary (DD) sheet that describes NUMBER_SPAN_RC
dfDD <- read_excel(revisedDDpath, sheet = "NUMBER_SPAN_RC")
## extract all logical variables
logicols <- colnames(df)[vapply(df, is.logical, logical(1))]
## view those variables in the regenerated DD
dfDD[dfDD$VarNames %in% logicols, c("VarNames", "Data Type")]
## # A tibble: 3 × 2
## VarNames `Data Type`
## <chr> <chr>
## 1 REFCTR VARCHAR2(6)
## 2 REVIEW_DATE date
## 3 REVIEWER CHAR
## select the vars to be converted to date
convert2date <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("date", dfDD$`Data Type`, ignore.case = TRUE)] ## REVIEW_DATE
## select the vars to be converted to numeric; derived from THIS sheet's DD so the result
## never depends on a `convert2num` left in the workspace by an earlier dataset section
convert2num <- dfDD$VarNames[dfDD$VarNames %in% logicols & grepl("number", dfDD$`Data Type`, ignore.case = TRUE)] ## none for NUMBER_SPAN_RC
## the rest should be converted to character
convert2chr <- setdiff(logicols, c(convert2num, convert2date)) ## [1] "REFCTR" "REVIEWER"
## convert (assigning to an empty column selection is a no-op, so empty groups are safe)
df[convert2date] <- lapply(df[convert2date], as.Date)
df[convert2num] <- lapply(df[convert2num], as.numeric)
df[convert2chr] <- lapply(df[convert2chr], as.character)
## recheck the unique data types
unique(sapply(df, class))
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "POSIXct" "POSIXt"
##
## [[4]]
## [1] "Date"
## extract date-time variables from sub-dataset (columns stored as POSIXct)
datecols <- colnames(df)[vapply(df, function(x) inherits(x, c("POSIXct", "POSIXt")), logical(1))]
## [1] "EXAM_DATE" "DATE_OF_BIRTH"
## extract date variables from regenerated DD
## the type is matched case-insensitively so "DATE", "date" and "Date" are all recognised
datecolsFromDD <- dfDD$VarNames[grepl("^date$", dfDD$`Data Type`, ignore.case = TRUE)]
## compare the two to see if we are missing any date variables
setdiff(datecols, datecolsFromDD) ## character(0)
## character(0)
setdiff(datecolsFromDD, datecols) ## [1] "REVIEW_DATE" can ignore REVIEW_DATE, as it has been corrected in previous step
## [1] "REVIEW_DATE"
## preview the date columns; drop = FALSE keeps a data frame even when only one column is selected
head(df[, datecols, drop = FALSE])
## EXAM_DATE DATE_OF_BIRTH
## 1 2024-02-13 1944-09-22
## 2 2024-02-13 1947-05-13
## 3 2023-10-24 1954-10-29
## 4 2023-05-15 1949-12-01
## 5 2024-02-15 1942-09-30
## 6 2023-05-09 1936-05-22
## convert format (POSIXct -> Date)
df[datecols] <- lapply(df[datecols], as.Date)
## recheck the unique data types
unique(sapply(df, class))
## [1] "numeric" "character" "Date"
## character columns as they are stored in the sub-dataset
chrcols <- names(Filter(is.character, df)) ## 39 vars
## cross-check the storage type against the DD:
## mismatchChrs_1: chr in the data but another type in the DD
## mismatchChrs_2: chr in the DD but another type in the data
chrColsfromDD <- dfDD %>%
  dplyr::filter(grepl("^(varchar|char)", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Data Type`)
mismatchChrs_1 <- setdiff(chrcols, chrColsfromDD[["VarNames"]]) ## character(0)
mismatchChrs_2 <- setdiff(chrColsfromDD[["VarNames"]], chrcols) ## character(0)
## DD rows of character variables that come with a value specification
tmp <- dfDD %>%
  dplyr::filter(
    grepl("CHAR|VARCHAR", `Data Type`, ignore.case = TRUE),
    !is.na(`Valid Responses`)
  ) %>%
  dplyr::select(VarNames, `Valid Responses`)
## compare the observed values of those columns with the DD specification
DT::datatable(check_valid_responses(tmp, df))
## All values are within valid ranges.
## numeric columns as they are stored in the sub-dataset
numcols <- names(Filter(is.numeric, df)) ## 43 vars
## numeric variables declared in the DD, cross-checked against the data:
## mismatchNums_1: numeric in the data but another type in the DD
## mismatchNums_2: numeric in the DD but another type in the data
numColsfromDD <- dfDD %>%
  dplyr::filter(grepl("number", `Data Type`, ignore.case = TRUE)) %>%
  dplyr::select(VarNames, `Valid Responses`)
mismatchNums_1 <- setdiff(numcols, numColsfromDD[["VarNames"]]) ## character(0)
mismatchNums_2 <- setdiff(numColsfromDD[["VarNames"]], numcols) ## character(0)
## distinct value specifications used by the numeric variables
unique(numColsfromDD[["Valid Responses"]])
## [1] NA "1 thru 99999;" "1 thru 9999;" "1;\r\n0;"
## keep only the numeric variables that actually carry a value specification
tmp <- numColsfromDD[!is.na(numColsfromDD[["Valid Responses"]]), ]
DT::datatable(check_valid_numeric_responses(tmp, df))
## ignore GP
## store the type-corrected copy back under the dataset name
NUMBER_SPAN_RC <- df
## Screen every dataset listed in df_names for duplicated records.
## Results:
##   - datasets with one row per SYSIND, or with unique SYSIND x EXAM_DATE pairs,
##     are reported as clean (numbered by `index`)
##   - longDfwithDuplicates: datasets with repeated SYSIND x EXAM_DATE pairs
##   - otherwithDuplicates: datasets with repeated SYSIND but no EXAM_DATE column,
##     which therefore cannot be checked here
longDfwithDuplicates <- c()
otherwithDuplicates <- c()
index <- 0
for (df_name in df_names) {
  df <- get(df_name)

  ## one row per individual: cross-sectional and clean, nothing more to check
  if (nrow(df) == length(unique(df[["SYSIND"]]))) {
    index <- index + 1
    cat(index,
        "No duplicates found in cross-sectional dataset: ",
        df_name,
        "\n")
    next
  }

  ## repeated SYSIND but no SYSIND/EXAM_DATE key available: set aside for later
  if (!all(c("SYSIND", "EXAM_DATE") %in% colnames(df))) {
    otherwithDuplicates <- c(otherwithDuplicates, df_name)
    next
  }

  ## longitudinal data: rows sharing the same SYSIND x EXAM_DATE pair
  dup_rows <- df %>%
    dplyr::group_by(SYSIND, EXAM_DATE) %>%
    dplyr::filter(n() > 1) %>%
    dplyr::ungroup()

  if (nrow(dup_rows) == 0) {
    index <- index + 1
    cat(index,
        "No duplicates found in longitudinal dataset: ",
        df_name,
        "\n")
  } else {
    longDfwithDuplicates <- c(longDfwithDuplicates, df_name)
  }
}
## 1 No duplicates found in longitudinal dataset: AAAD_GERIAT
## 2 No duplicates found in longitudinal dataset: AAAD_MEDCON
## 3 No duplicates found in longitudinal dataset: AAAD_SOCIO_DEMO
## 4 No duplicates found in longitudinal dataset: AAAD_TRAILS
## 5 No duplicates found in longitudinal dataset: ALZ_B9_JUDGE_RC
## 6 No duplicates found in longitudinal dataset: ALZ_CSDD
## 7 No duplicates found in cross-sectional dataset: ALZ_GAI_SP
## 8 No duplicates found in longitudinal dataset: ALZ_NEURO_CDR
## 9 No duplicates found in cross-sectional dataset: ALZ_RPFQ
## 10 No duplicates found in longitudinal dataset: ALZ_SCREENING_RC
## 11 No duplicates found in longitudinal dataset: ALZ_STICK_D_RC
## 12 No duplicates found in longitudinal dataset: B4_CDR_RC
## 13 No duplicates found in longitudinal dataset: B5_NPIQ_RC
## 14 No duplicates found in longitudinal dataset: B6_GDS_RC
## 15 No duplicates found in longitudinal dataset: B7_FAS_RC
## 16 No duplicates found in cross-sectional dataset: BCF_RECOG_RC
## 17 No duplicates found in cross-sectional dataset: BCFCD_RC
## 18 No duplicates found in cross-sectional dataset: BCFCI_RC
## 19 No duplicates found in cross-sectional dataset: BILINGUAL_SCALE_RC
## 20 No duplicates found in longitudinal dataset: CAT_FLUENCY_RC
## 21 No duplicates found in cross-sectional dataset: CERAD_DEL_RC
## 22 No duplicates found in cross-sectional dataset: CERAD_IMM_RC
## 23 No duplicates found in cross-sectional dataset: CERAD_RECOG_RC
## 24 No duplicates found in longitudinal dataset: CRAFT_21_DEL_RC
## 25 No duplicates found in longitudinal dataset: CRAFT_21_IMM_RC
## 26 No duplicates found in longitudinal dataset: MEDCON_RC
## 27 No duplicates found in longitudinal dataset: MEDICAL_HIST
## 28 No duplicates found in cross-sectional dataset: MINT_RC
## 29 No duplicates found in longitudinal dataset: MINT_SP_RC
## 30 No duplicates found in longitudinal dataset: MOCA_RC
## 31 No duplicates found in longitudinal dataset: NUMBER_SPAN_RC
## longitudinal datasets with duplicated SYSIND x EXAM_DATE pairs
longDfwithDuplicates
## [1] "ALZ_NPIQ_CBRS"
## datasets collected in otherwithDuplicates:
## the following datasets do not have EXAM_DATE but have other time variables,
## so I will check duplicates one by one for them based on their own time variables
otherwithDuplicates
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM" "ALZ_LOAD_COG" "ALZ_NCRAD"
## [5] "ALZ_SCREENING" "CONSENSUS_DX"
## Each pipeline below prints the number of rows whose (SYSIND, <time variable>) pair
## occurs more than once in that dataset; 0 means the pair identifies rows uniquely.
## check duplicates for ALZ_CLINICALSUM (time variable: FORM_DATE)
ALZ_CLINICALSUM %>%
dplyr::group_by(SYSIND, FORM_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_EXAM (time variable: FORM_DATE)
ALZ_EXAM %>%
dplyr::group_by(SYSIND, FORM_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_LOAD_COG (time variable: INTERVIEW_DATE)
ALZ_LOAD_COG %>%
dplyr::group_by(SYSIND, INTERVIEW_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for ALZ_NCRAD (time variable: FORM_DATE)
ALZ_NCRAD %>%
dplyr::group_by(SYSIND, FORM_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 2 rows
## [1] 2
## check duplicates for ALZ_SCREENING (time variable: FORM_DATE)
ALZ_SCREENING %>%
dplyr::group_by(SYSIND, FORM_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 0 row
## [1] 0
## check duplicates for CONSENSUS_DX (time variable: DATE_DX)
CONSENSUS_DX %>%
dplyr::group_by(SYSIND, DATE_DX) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup() %>%
nrow() %>% print() ## 211 rows
## [1] 211
## ---- ALZ_NPIQ_CBRS: inspect the duplicated SYSIND x EXAM_DATE records before fixing them ----
## dupFixCheck() is a project helper defined outside this section; judging by the echoed
## numbers it appears to count distinct SYSIND x EXAM_DATE combinations -- TODO confirm
cat("Before duplicates handling - SYSIND*EXAM_DATE is: ",dupFixCheck(ALZ_NPIQ_CBRS,"SYSIND","EXAM_DATE"),"\n")
## Before duplicates handling - SYSIND*EXAM_DATE is: 122
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:123, cols:116, inds:121
## view the duplicates (all rows whose SYSIND x EXAM_DATE pair occurs more than once)
ALZ_NPIQ_CBRS %>%
dplyr::group_by(SYSIND, EXAM_DATE) %>%
dplyr::filter(n() > 1) %>%
dplyr::ungroup()
## # A tibble: 2 × 116
## SYSXM SYSIND SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER STDCGI_ORDER
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7540713 11048883 7896183 1311503 7804743 1 1 11
## 2 7540723 11048883 7896183 1311503 7804743 1 1 11
## # ℹ 108 more variables: LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## # SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## # EXAM_DATE <date>, EXAMINER <chr>, DATE_OF_BIRTH <date>, AGE_AT_EXAM <dbl>,
## # NPIQINF <chr>, NPIQINF_PRO <chr>, NPIQINF_OTH <chr>, NPIQINFA <dbl>,
## # NPIQINFB <dbl>, NPIQTYPE <dbl>, AGIT <dbl>, AGITSEV <dbl>,
## # AGITATION_DIST <dbl>, DEPD <dbl>, DEPDSEV <dbl>, DEPRESS_DIST <dbl>,
## # ANX <dbl>, ANXSEV <dbl>, ANXIETY_DIST <dbl>, ELAT <dbl>, ELATSEV <dbl>, …
## after checking the duplicates, I decided to keep the second obs as it has less missingness
## SYSXM is a numeric column, so the record to drop is matched against a number rather than a
## string; %in% never returns NA, so a row with a missing SYSXM is kept instead of becoming an
## all-NA row
ALZ_NPIQ_CBRS <- ALZ_NPIQ_CBRS[!(ALZ_NPIQ_CBRS$SYSXM %in% 7540713), ]
info(ALZ_NPIQ_CBRS,"SYSIND")
## #obs:122, cols:116, inds:121
cat("After duplicates handling - SYSIND*EXAM_DATE is: ",dupFixCheck(ALZ_NPIQ_CBRS,"SYSIND","EXAM_DATE"),"\n")
## After duplicates handling - SYSIND*EXAM_DATE is: 122
## ALZ_NCRAD: state of the dataset before the duplicate is handled
cat(
  "Before duplicates handling - SYSIND*FORM_DATE is: ",
  dupFixCheck(ALZ_NCRAD, "SYSIND", "FORM_DATE"),
  "\n"
)
## Before duplicates handling - SYSIND*FORM_DATE is: 742
info(ALZ_NCRAD, "SYSIND")
## #obs:743, cols:53, inds:742
## show every row whose (SYSIND, FORM_DATE) key appears more than once
dplyr::ungroup(
  dplyr::filter(
    dplyr::group_by(ALZ_NCRAD, SYSIND, FORM_DATE),
    dplyr::n() > 1
  )
)
## # A tibble: 2 × 53
## SYSXM SYSIND SYSGP SYSGPSTUDY SYSINDGP CGI_ORDER GPS_ORDER STDCGI_ORDER
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7906253 11009263 7889133 1304473 7764083 1 1 11
## 2 8388533 11009263 7889133 1304473 7764083 1 1 11
## # ℹ 45 more variables: LSTUDY <chr>, DB_OWNER <chr>, STUDY <chr>,
## # SUBSTUDY <chr>, CENTER <chr>, GP <dbl>, IND <dbl>, REFCTR <chr>,
## # QUALIFY <chr>, FORM_DATE <date>, FILLED_OUT_BY <chr>, DATE_OF_BIRTH <date>,
## # IN_NCRAD <chr>, SAMPLED <dbl>, EDUC <dbl>, VISIT <dbl>, COMREQ <dbl>,
## # NOTDEMCI <dbl>, EVALMETH <dbl>, EVALYR <dbl>, CLDEMLEW <dbl>,
## # COMDXAD <chr>, NONADDEM <dbl>, COMDXNAD <chr>, AAOSYMP <dbl>,
## # STROKETY <dbl>, STROKEAGE <dbl>, HYPERAGE <dbl>, HEARTAGE <dbl>, …
## the duplicates are exactly the same, so either one can be dropped; I drop the first observation
## SYSXM is a numeric column, so the record to drop is matched against a number rather than a
## string; %in% never returns NA, so a row with a missing SYSXM is kept instead of becoming an
## all-NA row
ALZ_NCRAD <- ALZ_NCRAD[!(ALZ_NCRAD$SYSXM %in% 7906253), ]
info(ALZ_NCRAD,"SYSIND")
## #obs:742, cols:53, inds:742
cat("After duplicates handling - SYSIND*FORM_DATE is: ",dupFixCheck(ALZ_NCRAD,"SYSIND","FORM_DATE"),"\n")
## After duplicates handling - SYSIND*FORM_DATE is: 742
## CONSENSUS_DX: state of the dataset before the duplicates are handled
cat(
  "Before duplicates handling - SYSIND*DATE_DX is: ",
  dupFixCheck(CONSENSUS_DX, "SYSIND", "DATE_DX"),
  "\n"
)
## Before duplicates handling - SYSIND*DATE_DX is: 1700
info(CONSENSUS_DX, "SYSIND")
## #obs:1807, cols:43, inds:1584
## keep every row whose (SYSIND, DATE_DX) key appears more than once
dups_CONSENSUS_DX <- dplyr::ungroup(
  dplyr::filter(
    dplyr::group_by(CONSENSUS_DX, SYSIND, DATE_DX),
    dplyr::n() > 1
  )
)
info(dups_CONSENSUS_DX, "SYSIND")
## #obs:211, cols:43, inds:104
## some individuals have multiple CDX, and the RANK variable records the number of visits,
## so pivot_wider is used to keep all the CDX values side by side
## identifier columns for the pivot, picked by position: columns 1-16, the two date columns
## named below, and columns 27-43 of CONSENSUS_DX
IDcols <- local({
  dx_names <- names(CONSENSUS_DX)
  c(dx_names[1:16], "DATE_OF_BIRTH", "DATE_DX", dx_names[27:43])
})
IDcols
## [1] "SYSXM" "SYSIND" "SYSGP"
## [4] "SYSGPSTUDY" "SYSINDGP" "CGI_ORDER"
## [7] "GPS_ORDER" "STDCGI_ORDER" "LSTUDY"
## [10] "DB_OWNER" "STUDY" "SUBSTUDY"
## [13] "CENTER" "GP" "IND"
## [16] "REFCTR" "DATE_OF_BIRTH" "DATE_DX"
## [19] "CLINICAL_COMMENTS" "OTHER_TXT1" "OTHER_TXT2"
## [22] "OTHER_TXT3" "CALC_VAL1" "CALC_VAL2"
## [25] "CALC_VAL3" "CALC_VAL4" "CALC_VAL5"
## [28] "CALC_VAL6" "CALC_VAL7" "CALC_VAL8"
## [31] "CALC_VAL9" "CALC_VAL10" "CALC_VAL11"
## [34] "LAST_SOURCE" "OTHER_DATE1"
## spread the per-RANK review fields into one set of columns per RANK value
## (new columns are named <field>_<RANK>); IDcols identify the output rows
CONSENSUS_DX <- pivot_wider(
  CONSENSUS_DX,
  id_cols = all_of(IDcols),
  names_from = RANK,
  names_sep = "_",
  values_from = c(REVIEW_DATE, REVIEWER, RANK:WHO_DX, COMMENTS)
)
info(CONSENSUS_DX, "SYSIND")
## #obs:1701, cols:59, inds:1584
cat(
  "After duplicates handling - SYSIND*DATE_DX is: ",
  dupFixCheck(CONSENSUS_DX, "SYSIND", "DATE_DX"),
  "\n"
)
## After duplicates handling - SYSIND*DATE_DX is: 1700
## get total number of unique individuals
## pool the SYSIND column of every dataset named in df_names and keep each ID once
all_ids <- unique(unlist(lapply(df_names, function(nm) get(nm)$SYSIND)))
# Count unique SYSINDs
length(all_ids) ## 1994 individuals
## [1] 1994
# Presence matrix: one row per ID in all_ids, one column per dataset in df_names,
# each cell an HTML-coloured check mark (ID found in the dataset) or cross (not found)
presence_mat <- sapply(df_names, function(nm) {
  # start from "absent" everywhere, then flip the IDs that occur in this dataset
  marks <- rep(
    '<span style="color:darkred; font-weight:bold;">✘</span>',
    length(all_ids)
  )
  marks[all_ids %in% get(nm)$SYSIND] <-
    '<span style="color:darkgreen; font-weight:bold;">✔</span>'
  marks
})
# Put the IDs in front; check.names = FALSE keeps the dataset names as column names
presence_df <- data.frame(ID = all_ids, presence_mat, check.names = FALSE)
# columns to center: every column after ID; DataTables counts columns from 0,
# so ID is column 0 and the dataset columns are 1 .. ncol - 1
center_targets <- seq_len(ncol(presence_df) - 1)
# escape = FALSE lets the HTML marks render instead of showing as text
DT::datatable(
  presence_df,
  rownames = FALSE,
  escape = FALSE,
  options = list(
    pageLength = 50,
    scrollX = TRUE,
    columnDefs = list(
      list(className = "dt-center", targets = center_targets)
    )
  )
)
## one summary row per dataset: number of individuals, number of rows, and how many of
## the IDs in all_ids do not occur in the dataset
summary_df <- do.call(
  rbind,
  lapply(df_names, function(nm) {
    current <- get(nm)
    ids_here <- unique(current$SYSIND)
    data.frame(
      dataset = nm,
      n_individuals = length(ids_here),
      n_obs = nrow(current),
      n_individials_missing = length(setdiff(all_ids, ids_here)),
      stringsAsFactors = FALSE
    )
  })
)
DT::datatable(summary_df)
## Classify every dataset named in df_names:
##   cross_dfs            - one row per SYSIND (cross-sectional)
##   long_dfs_wEXAM_DATE  - repeated SYSIND and an EXAM_DATE column
##   long_dfs_woEXAM_DATE - repeated SYSIND and no EXAM_DATE column
## Each dataset is labelled once and the three name vectors are obtained by subsetting,
## so nothing is grown inside a loop and an empty category is character(0).
df_design <- vapply(df_names, function(nm) {
  current <- get(nm)
  if (nrow(current) == length(unique(current[["SYSIND"]]))) {
    "cross"
  } else if ("EXAM_DATE" %in% names(current)) {
    "long_wEXAM_DATE"
  } else {
    "long_woEXAM_DATE"
  }
}, character(1))
cross_dfs <- df_names[df_design == "cross"]
long_dfs_wEXAM_DATE <- df_names[df_design == "long_wEXAM_DATE"]
long_dfs_woEXAM_DATE <- df_names[df_design == "long_woEXAM_DATE"]
cross_dfs
## [1] "ALZ_GAI_SP" "ALZ_NCRAD" "ALZ_RPFQ"
## [4] "BCF_RECOG_RC" "BCFCD_RC" "BCFCI_RC"
## [7] "BILINGUAL_SCALE_RC" "CERAD_DEL_RC" "CERAD_IMM_RC"
## [10] "CERAD_RECOG_RC" "MINT_RC"
long_dfs_wEXAM_DATE
## [1] "AAAD_GERIAT" "AAAD_MEDCON" "AAAD_SOCIO_DEMO" "AAAD_TRAILS"
## [5] "ALZ_B9_JUDGE_RC" "ALZ_CSDD" "ALZ_NEURO_CDR" "ALZ_NPIQ_CBRS"
## [9] "ALZ_SCREENING_RC" "ALZ_STICK_D_RC" "B4_CDR_RC" "B5_NPIQ_RC"
## [13] "B6_GDS_RC" "B7_FAS_RC" "CAT_FLUENCY_RC" "CRAFT_21_DEL_RC"
## [17] "CRAFT_21_IMM_RC" "MEDCON_RC" "MEDICAL_HIST" "MINT_SP_RC"
## [21] "MOCA_RC" "NUMBER_SPAN_RC"
long_dfs_woEXAM_DATE
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM" "ALZ_LOAD_COG" "ALZ_SCREENING"
## [5] "CONSENSUS_DX"
## Split the dataset names in df_names by whether the dataset has an EXAM_DATE column.
## The flag is computed once per dataset and used to subset df_names, so no vector is
## grown inside a loop and an empty group is character(0).
has_exam_date <- vapply(
  df_names,
  function(nm) "EXAM_DATE" %in% names(get(nm)),
  logical(1)
)
dfwEXAM_DATE <- df_names[has_exam_date]
dfwoEXAM_DATE <- df_names[!has_exam_date]
print(dfwEXAM_DATE)
## [1] "AAAD_GERIAT" "AAAD_MEDCON" "AAAD_SOCIO_DEMO"
## [4] "AAAD_TRAILS" "ALZ_B9_JUDGE_RC" "ALZ_CSDD"
## [7] "ALZ_GAI_SP" "ALZ_NEURO_CDR" "ALZ_NPIQ_CBRS"
## [10] "ALZ_RPFQ" "ALZ_SCREENING_RC" "ALZ_STICK_D_RC"
## [13] "B4_CDR_RC" "B5_NPIQ_RC" "B6_GDS_RC"
## [16] "B7_FAS_RC" "BCF_RECOG_RC" "BCFCD_RC"
## [19] "BCFCI_RC" "BILINGUAL_SCALE_RC" "CAT_FLUENCY_RC"
## [22] "CERAD_DEL_RC" "CERAD_IMM_RC" "CERAD_RECOG_RC"
## [25] "CRAFT_21_DEL_RC" "CRAFT_21_IMM_RC" "MEDCON_RC"
## [28] "MEDICAL_HIST" "MINT_RC" "MINT_SP_RC"
## [31] "MOCA_RC" "NUMBER_SPAN_RC"
print(dfwoEXAM_DATE)
## [1] "ALZ_CLINICALSUM" "ALZ_EXAM" "ALZ_LOAD_COG" "ALZ_NCRAD"
## [5] "ALZ_SCREENING" "CONSENSUS_DX"
## for each dataset, report which of AGE_AT_EXAM / EXAM_DATE it contains
index <- 1
for (df_name in df_names) {
  df_obj <- get(df_name)
  nms <- names(df_obj)
  has_age <- "AGE_AT_EXAM" %in% nms
  has_date <- "EXAM_DATE" %in% nms
  ## choose the message first, then print it once
  status <- if (has_age) {
    if (has_date) {
      ": both EXAM_DATE and AGE_AT_EXAM present\n"
    } else {
      ": only AGE_AT_EXAM present\n"
    }
  } else if (has_date) {
    ": only EXAM_DATE present\n"
  } else {
    ": ============ none of them present ============\n"
  }
  cat(index, ":", df_name, status)
  index <- index + 1
}
## 1 : AAAD_GERIAT : both EXAM_DATE and AGE_AT_EXAM present
## 2 : AAAD_MEDCON : both EXAM_DATE and AGE_AT_EXAM present
## 3 : AAAD_SOCIO_DEMO : both EXAM_DATE and AGE_AT_EXAM present
## 4 : AAAD_TRAILS : both EXAM_DATE and AGE_AT_EXAM present
## 5 : ALZ_B9_JUDGE_RC : both EXAM_DATE and AGE_AT_EXAM present
## 6 : ALZ_CLINICALSUM : ============ none of them present ============
## 7 : ALZ_CSDD : both EXAM_DATE and AGE_AT_EXAM present
## 8 : ALZ_EXAM : ============ none of them present ============
## 9 : ALZ_GAI_SP : both EXAM_DATE and AGE_AT_EXAM present
## 10 : ALZ_LOAD_COG : ============ none of them present ============
## 11 : ALZ_NCRAD : ============ none of them present ============
## 12 : ALZ_NEURO_CDR : both EXAM_DATE and AGE_AT_EXAM present
## 13 : ALZ_NPIQ_CBRS : both EXAM_DATE and AGE_AT_EXAM present
## 14 : ALZ_RPFQ : both EXAM_DATE and AGE_AT_EXAM present
## 15 : ALZ_SCREENING : ============ none of them present ============
## 16 : ALZ_SCREENING_RC : both EXAM_DATE and AGE_AT_EXAM present
## 17 : ALZ_STICK_D_RC : both EXAM_DATE and AGE_AT_EXAM present
## 18 : B4_CDR_RC : both EXAM_DATE and AGE_AT_EXAM present
## 19 : B5_NPIQ_RC : both EXAM_DATE and AGE_AT_EXAM present
## 20 : B6_GDS_RC : both EXAM_DATE and AGE_AT_EXAM present
## 21 : B7_FAS_RC : both EXAM_DATE and AGE_AT_EXAM present
## 22 : BCF_RECOG_RC : both EXAM_DATE and AGE_AT_EXAM present
## 23 : BCFCD_RC : both EXAM_DATE and AGE_AT_EXAM present
## 24 : BCFCI_RC : both EXAM_DATE and AGE_AT_EXAM present
## 25 : BILINGUAL_SCALE_RC : both EXAM_DATE and AGE_AT_EXAM present
## 26 : CAT_FLUENCY_RC : both EXAM_DATE and AGE_AT_EXAM present
## 27 : CERAD_DEL_RC : both EXAM_DATE and AGE_AT_EXAM present
## 28 : CERAD_IMM_RC : both EXAM_DATE and AGE_AT_EXAM present
## 29 : CERAD_RECOG_RC : both EXAM_DATE and AGE_AT_EXAM present
## 30 : CONSENSUS_DX : ============ none of them present ============
## 31 : CRAFT_21_DEL_RC : both EXAM_DATE and AGE_AT_EXAM present
## 32 : CRAFT_21_IMM_RC : both EXAM_DATE and AGE_AT_EXAM present
## 33 : MEDCON_RC : both EXAM_DATE and AGE_AT_EXAM present
## 34 : MEDICAL_HIST : both EXAM_DATE and AGE_AT_EXAM present
## 35 : MINT_RC : both EXAM_DATE and AGE_AT_EXAM present
## 36 : MINT_SP_RC : both EXAM_DATE and AGE_AT_EXAM present
## 37 : MOCA_RC : both EXAM_DATE and AGE_AT_EXAM present
## 38 : NUMBER_SPAN_RC : both EXAM_DATE and AGE_AT_EXAM present
Findings: the AGE_AT_EXAM and EXAM_DATE variables are paired: if a dataset has one of them, it also has the other.
## for every dataset that has AGE_AT_EXAM, look for repeated keys twice:
## once on (SYSIND, AGE_AT_EXAM) and once on (SYSIND, EXAM_DATE)
index <- 1
for (df_name in df_names) {
  df_obj <- get(df_name)
  if (!("AGE_AT_EXAM" %in% names(df_obj))) {
    ## dataset without AGE_AT_EXAM: print a separator line only
    cat(index, " :", df_name, "===============================================", "\n")
  } else {
    cat(index, " :", df_name, " : AGE_AT_EXAM present", "\n")
    dup_rows_AGE_AT_EXAM <- dplyr::ungroup(
      dplyr::filter(dplyr::group_by(df_obj, SYSIND, AGE_AT_EXAM), dplyr::n() > 1)
    )
    dup_rows_EXAM_DATE <- dplyr::ungroup(
      dplyr::filter(dplyr::group_by(df_obj, SYSIND, EXAM_DATE), dplyr::n() > 1)
    )
    ## AGE_AT_EXAM duplicates take precedence; EXAM_DATE duplicates are reported only
    ## when there are none by AGE_AT_EXAM
    verdict <- if (nrow(dup_rows_AGE_AT_EXAM) > 0) {
      " : !!!duplicates present (checking by AGE_AT_EXAM )!!!"
    } else if (nrow(dup_rows_EXAM_DATE) > 0) {
      " : !!!duplicates present (checking by EXAM_DATE )!!!"
    } else {
      " : no duplicates found:)"
    }
    cat(index, " :", df_name, verdict, "\n")
  }
  index <- index + 1
}
## 1 : AAAD_GERIAT : AGE_AT_EXAM present
## 1 : AAAD_GERIAT : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 2 : AAAD_MEDCON : AGE_AT_EXAM present
## 2 : AAAD_MEDCON : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 3 : AAAD_SOCIO_DEMO : AGE_AT_EXAM present
## 3 : AAAD_SOCIO_DEMO : no duplicates found:)
## 4 : AAAD_TRAILS : AGE_AT_EXAM present
## 4 : AAAD_TRAILS : no duplicates found:)
## 5 : ALZ_B9_JUDGE_RC : AGE_AT_EXAM present
## 5 : ALZ_B9_JUDGE_RC : no duplicates found:)
## 6 : ALZ_CLINICALSUM ===============================================
## 7 : ALZ_CSDD : AGE_AT_EXAM present
## 7 : ALZ_CSDD : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 8 : ALZ_EXAM ===============================================
## 9 : ALZ_GAI_SP : AGE_AT_EXAM present
## 9 : ALZ_GAI_SP : no duplicates found:)
## 10 : ALZ_LOAD_COG ===============================================
## 11 : ALZ_NCRAD ===============================================
## 12 : ALZ_NEURO_CDR : AGE_AT_EXAM present
## 12 : ALZ_NEURO_CDR : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 13 : ALZ_NPIQ_CBRS : AGE_AT_EXAM present
## 13 : ALZ_NPIQ_CBRS : no duplicates found:)
## 14 : ALZ_RPFQ : AGE_AT_EXAM present
## 14 : ALZ_RPFQ : no duplicates found:)
## 15 : ALZ_SCREENING ===============================================
## 16 : ALZ_SCREENING_RC : AGE_AT_EXAM present
## 16 : ALZ_SCREENING_RC : no duplicates found:)
## 17 : ALZ_STICK_D_RC : AGE_AT_EXAM present
## 17 : ALZ_STICK_D_RC : no duplicates found:)
## 18 : B4_CDR_RC : AGE_AT_EXAM present
## 18 : B4_CDR_RC : no duplicates found:)
## 19 : B5_NPIQ_RC : AGE_AT_EXAM present
## 19 : B5_NPIQ_RC : no duplicates found:)
## 20 : B6_GDS_RC : AGE_AT_EXAM present
## 20 : B6_GDS_RC : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 21 : B7_FAS_RC : AGE_AT_EXAM present
## 21 : B7_FAS_RC : no duplicates found:)
## 22 : BCF_RECOG_RC : AGE_AT_EXAM present
## 22 : BCF_RECOG_RC : no duplicates found:)
## 23 : BCFCD_RC : AGE_AT_EXAM present
## 23 : BCFCD_RC : no duplicates found:)
## 24 : BCFCI_RC : AGE_AT_EXAM present
## 24 : BCFCI_RC : no duplicates found:)
## 25 : BILINGUAL_SCALE_RC : AGE_AT_EXAM present
## 25 : BILINGUAL_SCALE_RC : no duplicates found:)
## 26 : CAT_FLUENCY_RC : AGE_AT_EXAM present
## 26 : CAT_FLUENCY_RC : no duplicates found:)
## 27 : CERAD_DEL_RC : AGE_AT_EXAM present
## 27 : CERAD_DEL_RC : no duplicates found:)
## 28 : CERAD_IMM_RC : AGE_AT_EXAM present
## 28 : CERAD_IMM_RC : no duplicates found:)
## 29 : CERAD_RECOG_RC : AGE_AT_EXAM present
## 29 : CERAD_RECOG_RC : no duplicates found:)
## 30 : CONSENSUS_DX ===============================================
## 31 : CRAFT_21_DEL_RC : AGE_AT_EXAM present
## 31 : CRAFT_21_DEL_RC : no duplicates found:)
## 32 : CRAFT_21_IMM_RC : AGE_AT_EXAM present
## 32 : CRAFT_21_IMM_RC : no duplicates found:)
## 33 : MEDCON_RC : AGE_AT_EXAM present
## 33 : MEDCON_RC : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 34 : MEDICAL_HIST : AGE_AT_EXAM present
## 34 : MEDICAL_HIST : !!!duplicates present (checking by AGE_AT_EXAM )!!!
## 35 : MINT_RC : AGE_AT_EXAM present
## 35 : MINT_RC : no duplicates found:)
## 36 : MINT_SP_RC : AGE_AT_EXAM present
## 36 : MINT_SP_RC : no duplicates found:)
## 37 : MOCA_RC : AGE_AT_EXAM present
## 37 : MOCA_RC : no duplicates found:)
## 38 : NUMBER_SPAN_RC : AGE_AT_EXAM present
## 38 : NUMBER_SPAN_RC : no duplicates found:)
Findings: when merging, AGE_AT_EXAM cannot be used as the key column. For some people, observations with different exam dates end up with the same AGE_AT_EXAM value after rounding, so SYSIND and AGE_AT_EXAM together do not identify an observation uniquely.
##########################################################################################
## for every individual, get all their date data
## Each dataset contributes its (SYSIND, date) pairs; the date column depends on the dataset:
##   datasets in dfwEXAM_DATE                            -> EXAM_DATE
##   ALZ_CLINICALSUM, ALZ_EXAM, ALZ_NCRAD, ALZ_SCREENING -> FORM_DATE
##   ALZ_LOAD_COG                                        -> INTERVIEW_DATE
##   CONSENSUS_DX                                        -> DATE_DX
## All pieces are stacked with a single rbind() and de-duplicated once. This keeps the same
## rows in the same order as appending and de-duplicating step by step, because duplicated()
## always keeps the first occurrence (row names are not carried over).
dfwFORM_DATE <- c("ALZ_CLINICALSUM","ALZ_EXAM","ALZ_NCRAD","ALZ_SCREENING")
ppdat <- local({
  ## dataset name -> name of its date column, in the order the pieces are appended
  date_col <- c(
    setNames(rep("EXAM_DATE", length(dfwEXAM_DATE)), dfwEXAM_DATE),
    setNames(rep("FORM_DATE", length(dfwFORM_DATE)), dfwFORM_DATE),
    ALZ_LOAD_COG = "INTERVIEW_DATE",
    CONSENSUS_DX = "DATE_DX"
  )
  pieces <- lapply(seq_along(date_col), function(i) {
    piece <- get(names(date_col)[i])[, c("SYSIND", date_col[[i]])]
    names(piece) <- c("SYSIND", "EXAM_DATE")
    piece
  })
  ## the empty, typed frame goes first so the result is a plain data.frame with these columns
  empty <- data.frame(SYSIND = numeric(0), EXAM_DATE = as.Date(character(0)))
  stacked <- do.call(rbind, c(list(empty), pieces))
  stacked[!duplicated(stacked), ]
})
info(ppdat,"SYSIND") #obs:5204, cols:2, inds:1994
## #obs:5204, cols:2, inds:1994
sorted_ppdat <- ppdat %>% arrange(SYSIND, EXAM_DATE)
##########################################################################################
## check the time range between visits and get the distribution plot
# Step 1: within each individual, gap between an exam date and the previous one,
# in days and in months (30.436875 = 365.2425 / 12 days per month)
sorted_ppdat_intervals <- sorted_ppdat %>%
  group_by(SYSIND) %>%
  mutate(
    interval_days = as.numeric(difftime(EXAM_DATE, lag(EXAM_DATE), units = "days")),
    interval_month = interval_days / 30.436875
  ) %>%
  ungroup()
# Step 2: browse the intervals (the first date of each individual has no previous date, so it is NA)
DT::datatable(sorted_ppdat_intervals)
# Step 3: histogram and summary of all intervals, in months
hist(sorted_ppdat_intervals$interval_month)

summary(sorted_ppdat_intervals$interval_month)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0329 2.4641 8.8872 12.3731 18.7930 96.0020 1994
##########################################################################################
## creating the Visit Index variable
## Within each subject (rows are already sorted by date), an exam opens a new visit when it
## has no previous exam date or when it falls at least 3 calendar months after the previous
## exam; otherwise it stays in the current visit. Numbering starts at 1 for each subject.
sorted_ppdat2 <- sorted_ppdat %>%
  group_by(SYSIND) %>%
  mutate(
    prev_exam = lag(EXAM_DATE),
    ## cumsum over the "opens a new visit" flag gives the running visit number
    Visit_Index = cumsum(is.na(prev_exam) | EXAM_DATE >= (prev_exam %m+% months(3)))
  ) %>%
  ungroup() %>%
  select(-prev_exam)
## Visit summary: one row per individual, ordered by number of visits (most first)
visit_summary <- dplyr::arrange(
  dplyr::summarise(
    dplyr::group_by(sorted_ppdat2, SYSIND),
    total_visits = dplyr::n_distinct(Visit_Index),
    total_obs = dplyr::n(), # number of raw (SYSIND, date) rows for this person
    first_date = min(EXAM_DATE, na.rm = TRUE),
    last_date = max(EXAM_DATE, na.rm = TRUE),
    span_days = as.integer(last_date - first_date) # days between first and last date
  ),
  dplyr::desc(total_visits)
)
DT::datatable(visit_summary)
## add Visit index back to each dataset
## datasets with an EXAM_DATE column: join on (SYSIND, EXAM_DATE) and store the result
## as a new object named wVisitIndex_<dataset>
index <- 1
for (df_name in dfwEXAM_DATE) {
  cat("======================================================================", "\n")
  df_obj <- get(df_name)
  cat(index, ": Currently processing dataset: ", df_name, "\n")
  cat("Dataset dimension: ", "\n")
  info(df_obj, "SYSIND")
  ## merge() keeps only the rows whose key is found in both tables
  df_obj <- merge(
    x = df_obj,
    y = sorted_ppdat2,
    by = c("SYSIND", "EXAM_DATE")
  )
  cat("After adding visit index, dataset dimension: ", "\n")
  info(df_obj, "SYSIND")
  newdfname <- paste0("wVisitIndex_", df_name)
  assign(newdfname, df_obj)
  index <- index + 1
}
## ======================================================================
## 1 : Currently processing dataset: AAAD_GERIAT
## Dataset dimension:
## #obs:1051, cols:62, inds:939
## After adding visit index, dataset dimension:
## #obs:1051, cols:63, inds:939
## ======================================================================
## 2 : Currently processing dataset: AAAD_MEDCON
## Dataset dimension:
## #obs:397, cols:256, inds:367
## After adding visit index, dataset dimension:
## #obs:397, cols:257, inds:367
## ======================================================================
## 3 : Currently processing dataset: AAAD_SOCIO_DEMO
## Dataset dimension:
## #obs:402, cols:161, inds:391
## After adding visit index, dataset dimension:
## #obs:402, cols:162, inds:391
## ======================================================================
## 4 : Currently processing dataset: AAAD_TRAILS
## Dataset dimension:
## #obs:439, cols:34, inds:428
## After adding visit index, dataset dimension:
## #obs:439, cols:35, inds:428
## ======================================================================
## 5 : Currently processing dataset: ALZ_B9_JUDGE_RC
## Dataset dimension:
## #obs:483, cols:82, inds:481
## After adding visit index, dataset dimension:
## #obs:483, cols:83, inds:481
## ======================================================================
## 6 : Currently processing dataset: ALZ_CSDD
## Dataset dimension:
## #obs:181, cols:42, inds:176
## After adding visit index, dataset dimension:
## #obs:181, cols:43, inds:176
## ======================================================================
## 7 : Currently processing dataset: ALZ_GAI_SP
## Dataset dimension:
## #obs:19, cols:42, inds:19
## After adding visit index, dataset dimension:
## #obs:19, cols:43, inds:19
## ======================================================================
## 8 : Currently processing dataset: ALZ_NEURO_CDR
## Dataset dimension:
## #obs:1221, cols:30, inds:1102
## After adding visit index, dataset dimension:
## #obs:1221, cols:31, inds:1102
## ======================================================================
## 9 : Currently processing dataset: ALZ_NPIQ_CBRS
## Dataset dimension:
## #obs:122, cols:116, inds:121
## After adding visit index, dataset dimension:
## #obs:122, cols:117, inds:121
## ======================================================================
## 10 : Currently processing dataset: ALZ_RPFQ
## Dataset dimension:
## #obs:132, cols:67, inds:132
## After adding visit index, dataset dimension:
## #obs:132, cols:68, inds:132
## ======================================================================
## 11 : Currently processing dataset: ALZ_SCREENING_RC
## Dataset dimension:
## #obs:556, cols:61, inds:552
## After adding visit index, dataset dimension:
## #obs:556, cols:62, inds:552
## ======================================================================
## 12 : Currently processing dataset: ALZ_STICK_D_RC
## Dataset dimension:
## #obs:430, cols:46, inds:428
## After adding visit index, dataset dimension:
## #obs:430, cols:47, inds:428
## ======================================================================
## 13 : Currently processing dataset: B4_CDR_RC
## Dataset dimension:
## #obs:599, cols:38, inds:592
## After adding visit index, dataset dimension:
## #obs:599, cols:39, inds:592
## ======================================================================
## 14 : Currently processing dataset: B5_NPIQ_RC
## Dataset dimension:
## #obs:305, cols:38, inds:304
## After adding visit index, dataset dimension:
## #obs:305, cols:39, inds:304
## ======================================================================
## 15 : Currently processing dataset: B6_GDS_RC
## Dataset dimension:
## #obs:543, cols:39, inds:539
## After adding visit index, dataset dimension:
## #obs:543, cols:40, inds:539
## ======================================================================
## 16 : Currently processing dataset: B7_FAS_RC
## Dataset dimension:
## #obs:435, cols:33, inds:431
## After adding visit index, dataset dimension:
## #obs:435, cols:34, inds:431
## ======================================================================
## 17 : Currently processing dataset: BCF_RECOG_RC
## Dataset dimension:
## #obs:266, cols:24, inds:266
## After adding visit index, dataset dimension:
## #obs:266, cols:25, inds:266
## ======================================================================
## 18 : Currently processing dataset: BCFCD_RC
## Dataset dimension:
## #obs:269, cols:38, inds:269
## After adding visit index, dataset dimension:
## #obs:269, cols:39, inds:269
## ======================================================================
## 19 : Currently processing dataset: BCFCI_RC
## Dataset dimension:
## #obs:270, cols:38, inds:270
## After adding visit index, dataset dimension:
## #obs:270, cols:39, inds:270
## ======================================================================
## 20 : Currently processing dataset: BILINGUAL_SCALE_RC
## Dataset dimension:
## #obs:240, cols:90, inds:240
## After adding visit index, dataset dimension:
## #obs:240, cols:91, inds:240
## ======================================================================
## 21 : Currently processing dataset: CAT_FLUENCY_RC
## Dataset dimension:
## #obs:555, cols:29, inds:550
## After adding visit index, dataset dimension:
## #obs:555, cols:30, inds:550
## ======================================================================
## 22 : Currently processing dataset: CERAD_DEL_RC
## Dataset dimension:
## #obs:177, cols:44, inds:177
## After adding visit index, dataset dimension:
## #obs:177, cols:45, inds:177
## ======================================================================
## 23 : Currently processing dataset: CERAD_IMM_RC
## Dataset dimension:
## #obs:188, cols:88, inds:188
## After adding visit index, dataset dimension:
## #obs:188, cols:89, inds:188
## ======================================================================
## 24 : Currently processing dataset: CERAD_RECOG_RC
## Dataset dimension:
## #obs:177, cols:48, inds:177
## After adding visit index, dataset dimension:
## #obs:177, cols:49, inds:177
## ======================================================================
## 25 : Currently processing dataset: CRAFT_21_DEL_RC
## Dataset dimension:
## #obs:523, cols:95, inds:519
## After adding visit index, dataset dimension:
## #obs:523, cols:96, inds:519
## ======================================================================
## 26 : Currently processing dataset: CRAFT_21_IMM_RC
## Dataset dimension:
## #obs:530, cols:98, inds:525
## After adding visit index, dataset dimension:
## #obs:530, cols:99, inds:525
## ======================================================================
## 27 : Currently processing dataset: MEDCON_RC
## Dataset dimension:
## #obs:627, cols:237, inds:618
## After adding visit index, dataset dimension:
## #obs:627, cols:238, inds:618
## ======================================================================
## 28 : Currently processing dataset: MEDICAL_HIST
## Dataset dimension:
## #obs:889, cols:53, inds:871
## After adding visit index, dataset dimension:
## #obs:889, cols:54, inds:871
## ======================================================================
## 29 : Currently processing dataset: MINT_RC
## Dataset dimension:
## #obs:3, cols:221, inds:3
## After adding visit index, dataset dimension:
## #obs:3, cols:222, inds:3
## ======================================================================
## 30 : Currently processing dataset: MINT_SP_RC
## Dataset dimension:
## #obs:303, cols:221, inds:301
## After adding visit index, dataset dimension:
## #obs:303, cols:222, inds:301
## ======================================================================
## 31 : Currently processing dataset: MOCA_RC
## Dataset dimension:
## #obs:585, cols:140, inds:580
## After adding visit index, dataset dimension:
## #obs:585, cols:141, inds:580
## ======================================================================
## 32 : Currently processing dataset: NUMBER_SPAN_RC
## Dataset dimension:
## #obs:527, cols:85, inds:522
## After adding visit index, dataset dimension:
## #obs:527, cols:86, inds:522
## Now add visit index to the datasets keyed by FORM_DATE
## same lookup table as sorted_ppdat2, with the date column renamed so that merge() can use
## one key name on both sides
sorted_ppdat3 <- dplyr::rename(sorted_ppdat2, FORM_DATE = EXAM_DATE)
## index is not reset here, so the numbering continues from its current value
for (df_name in dfwFORM_DATE) {
  cat("======================================================================", "\n")
  df_obj <- get(df_name)
  cat(index, ": Currently processing dataset: ", df_name, "\n")
  cat("Dataset dimension: ", "\n")
  info(df_obj, "SYSIND")
  ## merge() keeps only the rows whose key is found in both tables
  df_obj <- merge(
    x = df_obj,
    y = sorted_ppdat3,
    by = c("SYSIND", "FORM_DATE")
  )
  cat("After adding visit index, dataset dimension: ", "\n")
  info(df_obj, "SYSIND")
  newdfname <- paste0("wVisitIndex_", df_name)
  assign(newdfname, df_obj)
  index <- index + 1
}
## ======================================================================
## 33 : Currently processing dataset: ALZ_CLINICALSUM
## Dataset dimension:
## #obs:1484, cols:39, inds:1480
## After adding visit index, dataset dimension:
## #obs:1484, cols:40, inds:1480
## ======================================================================
## 34 : Currently processing dataset: ALZ_EXAM
## Dataset dimension:
## #obs:526, cols:80, inds:522
## After adding visit index, dataset dimension:
## #obs:526, cols:81, inds:522
## ======================================================================
## 35 : Currently processing dataset: ALZ_NCRAD
## Dataset dimension:
## #obs:742, cols:53, inds:742
## After adding visit index, dataset dimension:
## #obs:742, cols:54, inds:742
## ======================================================================
## 36 : Currently processing dataset: ALZ_SCREENING
## Dataset dimension:
## #obs:279, cols:49, inds:272
## After adding visit index, dataset dimension:
## #obs:279, cols:50, inds:272
## Finally, we add visit index to the last two datasets: ALZ_LOAD_COG and CONSENSUS_DX
## ALZ_LOAD_COG is handled on its own because its date column is INTERVIEW_DATE;
## the steps mirror the loops above: report size, merge, report size, store the result
cat("======================================================================","\n")
## ======================================================================
df_obj <- ALZ_LOAD_COG # get the dataframe
df_name <- "ALZ_LOAD_COG"
cat(index,": Currently processing dataset: ",df_name,"\n")
## 37 : Currently processing dataset: ALZ_LOAD_COG
cat("Dataset dimension: ","\n")
## Dataset dimension:
info(df_obj,"SYSIND")
## #obs:1006, cols:41, inds:907
# match INTERVIEW_DATE in the dataset against EXAM_DATE in the visit lookup table;
# merge() keeps only the rows whose key is found in both tables
df_obj <- merge(df_obj,sorted_ppdat2,by.x=c("SYSIND","INTERVIEW_DATE"),by.y = c("SYSIND","EXAM_DATE"))
cat("After adding visit index, dataset dimension: ","\n")
## After adding visit index, dataset dimension:
info(df_obj,"SYSIND")
## #obs:1006, cols:42, inds:907
## change dataset name
# the merged copy is stored as wVisitIndex_ALZ_LOAD_COG; ALZ_LOAD_COG itself is not modified
newdfname <- paste0("wVisitIndex_",df_name)
assign(newdfname, df_obj)
# advance the running dataset counter for the next dataset
index = index + 1
# CONSENSUS_DX is handled on its own because its date column is DATE_DX;
# the steps are: report size, merge with the visit lookup table, report size, store the result
cat("======================================================================","\n")
## ======================================================================
df_obj <- CONSENSUS_DX # get the dataframe
df_name <- "CONSENSUS_DX"
cat(index,": Currently processing dataset: ",df_name,"\n")
## 38 : Currently processing dataset: CONSENSUS_DX
cat("Dataset dimension: ","\n")
## Dataset dimension:
info(df_obj,"SYSIND")
## #obs:1701, cols:59, inds:1584
# match DATE_DX in the dataset against EXAM_DATE in the visit lookup table;
# merge() keeps only the rows whose key is found in both tables
df_obj <- merge(df_obj,sorted_ppdat2,by.x=c("SYSIND","DATE_DX"),by.y = c("SYSIND","EXAM_DATE"))
cat("After adding visit index, dataset dimension: ","\n")
## After adding visit index, dataset dimension:
info(df_obj,"SYSIND")
## #obs:1701, cols:60, inds:1584
## change dataset name
# the merged copy is stored as wVisitIndex_CONSENSUS_DX; CONSENSUS_DX itself is not modified
newdfname <- paste0("wVisitIndex_",df_name)
assign(newdfname, df_obj)
## from here on df_names holds the names of the wVisitIndex_* objects in the workspace
df_names <- grep("^wVisitIndex_", ls(), value = TRUE)
length(df_names)
[1] 38
## collect the datasets that still have more than one row per (SYSIND, Visit_Index)
## and show the offending rows of each in its own table
df_names_wDuplicates <- c()
for (df_name in df_names) {
  df <- dplyr::filter(
    dplyr::group_by(get(df_name), SYSIND, Visit_Index),
    dplyr::n() > 1
  )
  if (nrow(df) == 0) {
    next
  }
  df_names_wDuplicates <- c(df_names_wDuplicates, df_name)
  ## widgets are not auto-printed inside a loop, hence the explicit print()
  print(htmltools::tagList(DT::datatable(df, caption = df_name)))
}
cat("Datasets with duplicates after grouping by SYSIND and Visit_Index: ", "\n")
Datasets with duplicates after grouping by SYSIND and Visit_Index:
print(df_names_wDuplicates)
[1] "wVisitIndex_AAAD_GERIAT"     "wVisitIndex_ALZ_CLINICALSUM"
[3] "wVisitIndex_ALZ_CSDD"        "wVisitIndex_ALZ_NEURO_CDR"
[5] "wVisitIndex_CONSENSUS_DX"    "wVisitIndex_MEDICAL_HIST"